OpenDAS / Megatron-LM · Commits

Commit a44360ed
authored Aug 07, 2022 by Vijay Korthikanti
Parent: 45f4ee54

    address review comments

Showing 2 changed files with 16 additions and 215 deletions (+16, -215):

    megatron/arguments.py            +1   -3
    megatron/tokenizer/tokenizer.py  +15  -212

megatron/arguments.py @ a44360ed
...
@@ -851,12 +851,10 @@ def _add_data_args(parser):
                        choices=['BertWordPieceLowerCase',
                                 'BertWordPieceCase',
                                 'GPT2BPETokenizer',
-                                'YTTMTokenizer',
-                                'ByteLevelBPETokenizer',
                                 'SentencePieceTokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--tokenizer-model', type=str, default=None,
-                       help='YTTM tokenizer model.')
+                       help='Sentencepiece tokenizer model.')
     group.add_argument('--data-impl', type=str, default='infer',
                        choices=['lazy', 'cached', 'mmap', 'infer'],
                        help='Implementation of indexed datasets.')
...
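To make the narrowed option set concrete, here is a small standalone sketch of the resulting command-line behavior. It is an illustration added for this write-up, not code from the commit: the parser below only mirrors the choices and help strings shown in the new version of the hunk, and 'spm.model' is a hypothetical path.

# Standalone illustration of the tokenizer options after this commit.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--tokenizer-type', type=str, default=None,
                    choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
                             'GPT2BPETokenizer', 'SentencePieceTokenizer'],
                    help='What type of tokenizer to use.')
parser.add_argument('--tokenizer-model', type=str, default=None,
                    help='Sentencepiece tokenizer model.')

# '--tokenizer-type YTTMTokenizer' would now fail argparse's choices check.
args = parser.parse_args(['--tokenizer-type', 'SentencePieceTokenizer',
                          '--tokenizer-model', 'spm.model'])
print(args.tokenizer_type, args.tokenizer_model)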
megatron/tokenizer/tokenizer.py @ a44360ed
...
@@ -20,9 +20,6 @@ from abc import abstractmethod
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
-import sentencepiece
-import tokenizers
-import youtokentome as yttm


 def build_tokenizer(args):
...
@@ -44,13 +41,6 @@ def build_tokenizer(args):
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-    elif args.tokenizer_type == 'YTTMTokenizer':
-        assert args.tokenizer_model is not None
-        tokenizer = _YTTMTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids)
-    elif args.tokenizer_type == 'ByteLevelBPETokenizer':
-        assert args.vocab_file is not None
-        assert args.merge_file is not None
-        tokenizer = _ByteLevelBPETokenizer(args.vocab_file, args.merge_file, vocab_extra_ids=args.vocab_extra_ids)
     elif args.tokenizer_type == 'SentencePieceTokenizer':
         assert args.tokenizer_model is not None
         tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids)
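For reference, a minimal usage sketch of the retained SentencePiece path, mirroring what build_tokenizer() does for tokenizer_type == 'SentencePieceTokenizer'. This is my own illustration rather than code from the commit; it assumes Megatron-LM is importable, and both the 'spm.model' path and the vocab_extra_ids value are hypothetical.

# Illustrative only: exercises the retained SentencePiece branch above.
# Assumes Megatron-LM is on PYTHONPATH and 'spm.model' is a placeholder
# SentencePiece model file.
from megatron.tokenizer.tokenizer import _SentencePieceTokenizer

tokenizer = _SentencePieceTokenizer('spm.model', vocab_extra_ids=100)
ids = tokenizer.tokenize('hello world')   # text -> token ids
text = tokenizer.detokenize(ids)          # token ids -> text
print(tokenizer.vocab_size, tokenizer.eod)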
...
@@ -304,200 +294,6 @@ class _GPT2BPETokenizer(AbstractTokenizer):
         return self.eod_id


-class _YTTMTokenizer(AbstractTokenizer):
-    """ YTTM tokenizer."""
-
-    def __init__(self, model_path, vocab_extra_ids=0):
-        name = 'YTTM'
-        super().__init__(name)
-
-        self.bpe = yttm.BPE(model=model_path)
-        self.vocab_ = {}
-        self.inv_vocab_ = {}
-        self._additional_special_tokens = []
-        self._initalize(vocab_extra_ids)
-
-    def _initalize(self, vocab_extra_ids):
-        for subword in self.bpe.vocab():
-            self.add_token(subword)
-
-        self.add_token('<CLS>'); self.cls_id = self.vocab_['<CLS>']
-        self.add_token('<SEP>'); self.sep_id = self.vocab_['<SEP>']
-        self.add_token('<PAD>'); self.pad_id = self.vocab_['<PAD>']
-        self.add_token('<BOS>'); self.bos_id = self.vocab_['<BOS>']
-        self.add_token('<EOS>'); self.eos_id = self.vocab_['<EOS>']
-        self.add_token('<EOD>'); self.eod_id = self.vocab_['<EOD>']
-        self.add_token('<MASK>'); self.mask_id = self.vocab_['<MASK>']
-        self.special_token_ids = [self.cls_id, self.sep_id, self.pad_id,
-                                  self.bos_id, self.eos_id, self.eod_id,
-                                  self.mask_id]
-        self.add_additional_special_tokens(
-            ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)])
-
-    def add_token(self, token):
-        if token not in self.vocab:
-            self.inv_vocab[self.vocab_size] = token
-            self.vocab[token] = self.vocab_size
-
-    def add_additional_special_tokens(self, tokens):
-        for token in tokens:
-            if token not in self.vocab:
-                self._additional_special_tokens.append(token)
-                self.special_token_ids.append(token)
-                self.add_token(token)
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab_)
-
-    @property
-    def vocab(self):
-        return self.vocab_
-
-    @property
-    def inv_vocab(self):
-        return self.inv_vocab_
-
-    def tokenize(self, text):
-        return self.bpe.encode([text], output_type=yttm.OutputType.ID)[0]
-
-    def detokenize(self, token_ids):
-        return self.bpe.decode([token_ids], ignore_ids=self.special_token_ids)[0]
-
-    @property
-    def cls(self):
-        return self.cls_id
-
-    @property
-    def sep(self):
-        return self.sep_id
-
-    @property
-    def pad(self):
-        return self.pad_id
-
-    @property
-    def bos_token_id(self):
-        return self.bos_id
-
-    @property
-    def bos(self):
-        return self.bos_id
-
-    @property
-    def eod(self):
-        return self.eod_id
-
-    @property
-    def eos_token_id(self):
-        return self.eos_id
-
-    @property
-    def eos(self):
-        return self.eos_id
-
-    @property
-    def mask(self):
-        return self.mask_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        return [self.vocab.get(token) for token in self._additional_special_tokens]
-
-
-class _ByteLevelBPETokenizer(AbstractTokenizer):
-    """ByteLevelBPETokenizer that can support T5 pretraining."""
-
-    def __init__(self, vocab_file, merges_file, vocab_extra_ids=0):
-        name = 'ByteLevelBPETokenizer'
-        super().__init__(name)
-
-        self._bpe = tokenizers.ByteLevelBPETokenizer(vocab=vocab_file, merges=merges_file)
-        self._inv_vocab = {}
-        self._additional_special_tokens = []
-        self._initalize(vocab_extra_ids)
-
-    def _initalize(self, vocab_extra_ids):
-        self._bpe.add_special_tokens(['<CLS>', '<SEP>', '<PAD>',
-                                      '<BOS>', '<EOS>', '<EOD>', '<MASK>'])
-        self._cls_id = self.vocab['<CLS>']
-        self._sep_id = self.vocab['<SEP>']
-        self._pad_id = self.vocab['<PAD>']
-        self._bos_id = self.vocab['<BOS>']
-        self._eos_id = self.vocab['<EOS>']
-        self._eod_id = self.vocab['<EOD>']
-        self._mask_id = self.vocab['<MASK>']
-
-        t5_tokens = ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)]
-        self._bpe.add_special_tokens(t5_tokens)
-        self._additional_special_tokens = t5_tokens
-
-    @property
-    def vocab_size(self):
-        return self._bpe.get_vocab_size()
-
-    @property
-    def vocab(self):
-        return self._bpe.get_vocab()
-
-    @property
-    def inv_vocab(self):
-        vocab = self.vocab
-        if len(self._inv_vocab) != len(vocab):
-            self._inv_vocab = {}
-            for (k, v) in vocab.items():
-                self._inv_vocab[v] = k
-        return self._inv_vocab
-
-    def tokenize(self, text):
-        return self._bpe.encode(text).ids
-
-    def detokenize(self, token_ids):
-        return self._bpe.decode(token_ids)
-
-    @property
-    def cls(self):
-        return self._cls_id
-
-    @property
-    def sep(self):
-        return self._sep_id
-
-    @property
-    def pad(self):
-        return self._pad_id
-
-    @property
-    def bos_token_id(self):
-        return self._bos_id
-
-    @property
-    def bos(self):
-        return self._bos_id
-
-    @property
-    def eod(self):
-        return self._eod_id
-
-    @property
-    def eos_token_id(self):
-        return self._eos_id
-
-    @property
-    def eos(self):
-        return self._eos_id
-
-    @property
-    def mask(self):
-        return self._mask_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        return [self.vocab.get(token) for token in self._additional_special_tokens]
-
-
 class _SentencePieceTokenizer(AbstractTokenizer):
     """SentencePieceTokenizer-Megatron wrapper"""
...
@@ -505,6 +301,7 @@ class _SentencePieceTokenizer(AbstractTokenizer):
         name = 'SentencePieceTokenizer'
         super().__init__(name)

+        import sentencepiece
         self._tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file)
         self._initalize(vocab_extra_ids)
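As context for the wrapper code in the next hunk, here is a hedged sketch of the underlying sentencepiece calls it builds on. This is added for illustration and is not part of the commit; the 'spm.model' file name is a placeholder.

# Sketch of the sentencepiece API used by _SentencePieceTokenizer (assumes the
# `sentencepiece` package is installed; 'spm.model' is a placeholder path).
import sentencepiece

sp = sentencepiece.SentencePieceProcessor(model_file='spm.model')
ids = sp.encode('hello world', out_type=int)   # text -> piece ids
print(sp.decode(ids))                          # piece ids -> text
# pad/bos/eos ids can be -1 for models trained without those pieces, and
# id_to_piece() then raises IndexError -- hence the '<PAD>'/'<BOS>'/'<EOS>'
# fallbacks in the wrapper's try/except blocks below.
print(sp.pad_id(), sp.bos_id(), sp.eos_id())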
...
@@ -530,31 +327,38 @@ class _SentencePieceTokenizer(AbstractTokenizer):
...
@@ -530,31 +327,38 @@ class _SentencePieceTokenizer(AbstractTokenizer):
self
.
_special_tokens
[
t
]
=
self
.
_vocab
[
t
]
self
.
_special_tokens
[
t
]
=
self
.
_vocab
[
t
]
self
.
_inv_special_tokens
[
self
.
_vocab
[
t
]]
=
t
self
.
_inv_special_tokens
[
self
.
_vocab
[
t
]]
=
t
_add_special_token
(
'<CLS>'
);
self
.
_cls_id
=
self
.
_vocab
[
'<CLS>'
]
_add_special_token
(
'<CLS>'
)
_add_special_token
(
'<SEP>'
);
self
.
_sep_id
=
self
.
_vocab
[
'<SEP>'
]
self
.
_cls_id
=
self
.
_vocab
[
'<CLS>'
]
_add_special_token
(
'<EOD>'
);
self
.
_eod_id
=
self
.
_vocab
[
'<EOD>'
]
_add_special_token
(
'<SEP>'
)
_add_special_token
(
'<MASK>'
);
self
.
_mask_id
=
self
.
_vocab
[
'<MASK>'
]
self
.
_sep_id
=
self
.
_vocab
[
'<SEP>'
]
_add_special_token
(
'<EOD>'
)
self
.
_eod_id
=
self
.
_vocab
[
'<EOD>'
]
_add_special_token
(
'<MASK>'
)
self
.
_mask_id
=
self
.
_vocab
[
'<MASK>'
]
pad_id
=
self
.
_tokenizer
.
pad_id
()
pad_id
=
self
.
_tokenizer
.
pad_id
()
try
:
try
:
pad_token
=
self
.
_tokenizer
.
id_to_piece
(
pad_id
)
pad_token
=
self
.
_tokenizer
.
id_to_piece
(
pad_id
)
except
IndexError
:
except
IndexError
:
pad_token
=
'<PAD>'
pad_token
=
'<PAD>'
_add_special_token
(
pad_token
);
self
.
_pad_id
=
self
.
_vocab
[
pad_token
]
_add_special_token
(
pad_token
)
self
.
_pad_id
=
self
.
_vocab
[
pad_token
]
bos_id
=
self
.
_tokenizer
.
bos_id
()
bos_id
=
self
.
_tokenizer
.
bos_id
()
try
:
try
:
bos_token
=
self
.
_tokenizer
.
id_to_piece
(
bos_id
)
bos_token
=
self
.
_tokenizer
.
id_to_piece
(
bos_id
)
except
IndexError
:
except
IndexError
:
bos_token
=
'<BOS>'
bos_token
=
'<BOS>'
_add_special_token
(
bos_token
);
self
.
_bos_id
=
self
.
_vocab
[
bos_token
]
_add_special_token
(
bos_token
)
self
.
_bos_id
=
self
.
_vocab
[
bos_token
]
eos_id
=
self
.
_tokenizer
.
eos_id
()
eos_id
=
self
.
_tokenizer
.
eos_id
()
try
:
try
:
eos_token
=
self
.
_tokenizer
.
id_to_piece
(
eos_id
)
eos_token
=
self
.
_tokenizer
.
id_to_piece
(
eos_id
)
except
IndexError
:
except
IndexError
:
eos_token
=
'<EOS>'
eos_token
=
'<EOS>'
_add_special_token
(
eos_token
);
self
.
_eos_id
=
self
.
_vocab
[
eos_token
]
_add_special_token
(
eos_token
)
self
.
_eos_id
=
self
.
_vocab
[
eos_token
]
for
i
in
range
(
vocab_extra_ids
):
for
i
in
range
(
vocab_extra_ids
):
t
=
"<extra_id_{}>"
.
format
(
i
)
t
=
"<extra_id_{}>"
.
format
(
i
)
...
@@ -578,7 +382,6 @@ class _SentencePieceTokenizer(AbstractTokenizer):
     def tokenize(self, text):
         ids = []
         idx = 0
-        last_idx = 0

         while 1:
             indices = {}
...