OpenDAS / Megatron-LM

Commit ef5b2f06, authored Apr 24, 2020 by Raul Puri
Parent: 03d28809

    added case wordpiece arguments
Showing 3 changed files with 5 additions and 1 deletion:

    megatron/arguments.py            +1 -0
    megatron/tokenizer/tokenizer.py  +3 -0
    tools/preprocess_data.py         +1 -1
megatron/arguments.py

@@ -332,6 +332,7 @@ def _add_data_args(parser):
     group.add_argument('--tokenizer-type', type=str,
                        default=None,
                        choices=['BertWordPieceLowerCase',
+                                'BertWordPieceCase',
                                 'GPT2BPETokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--data-impl', type=str, default='infer',
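For context on what this one-line change buys: argparse's choices keyword is what enforces the new option at the command line, so any value outside the list is rejected before training starts. A minimal standalone sketch of that mechanism (illustrative only, not repo code):

    import argparse

    # Sketch of how the choices list constrains --tokenizer-type.
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, default=None,
                       choices=['BertWordPieceLowerCase',
                                'BertWordPieceCase',
                                'GPT2BPETokenizer'],
                       help='What type of tokenizer to use.')

    args = parser.parse_args(['--tokenizer-type', 'BertWordPieceCase'])
    print(args.tokenizer_type)  # BertWordPieceCase
    # Any value outside the choices list makes argparse exit with a usage error.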
megatron/tokenizer/tokenizer.py

@@ -33,6 +33,9 @@ def build_tokenizer(args):
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True)
+    elif args.tokenizer_type == 'BertWordPieceCase':
+        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
+                                            lower_case=False)
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
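The only difference between the two WordPiece branches is the lower_case flag: the cased variant skips lowercasing the input and should be paired with a cased vocab file. A toy stand-in showing what that flag controls (FakeWordPieceTokenizer is hypothetical; the repo's _BertWordPieceTokenizer wraps a real WordPiece implementation):

    # Hypothetical stand-in; real WordPiece does subword matching on a vocab.
    class FakeWordPieceTokenizer:
        def __init__(self, vocab_file, lower_case):
            self.vocab_file = vocab_file
            self.lower_case = lower_case

        def tokenize(self, text):
            if self.lower_case:          # BertWordPieceLowerCase path
                text = text.lower()
            return text.split()

    cased = FakeWordPieceTokenizer('bert-cased-vocab.txt', lower_case=False)
    uncased = FakeWordPieceTokenizer('bert-vocab.txt', lower_case=True)
    print(cased.tokenize('Hello World'))    # ['Hello', 'World']
    print(uncased.tokenize('Hello World'))  # ['hello', 'world']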
tools/preprocess_data.py

@@ -104,7 +104,7 @@ def get_args():
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--tokenizer-type', type=str, required=True,
-                       choices=['BertWordPieceLowerCase',
+                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
                                 'GPT2BPETokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--vocab-file', type=str, default=None,
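With all three files updated, preprocessing a corpus with a cased BERT vocab might look like the following. Only --tokenizer-type and --vocab-file appear in the diff above; the other flags and file names are assumptions about preprocess_data.py's remaining arguments, shown for illustration:

    python tools/preprocess_data.py \
        --tokenizer-type BertWordPieceCase \
        --vocab-file bert-large-cased-vocab.txt \
        --input corpus.json \
        --output-prefix corpus_cased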