Commit ef5b2f06 authored by Raul Puri
Browse files

Added cased WordPiece tokenizer option (`BertWordPieceCase`) to the tokenizer-type arguments

parent 03d28809
...@@ -332,6 +332,7 @@ def _add_data_args(parser): ...@@ -332,6 +332,7 @@ def _add_data_args(parser):
group.add_argument('--tokenizer-type', type=str, group.add_argument('--tokenizer-type', type=str,
default=None, default=None,
choices=['BertWordPieceLowerCase', choices=['BertWordPieceLowerCase',
'BertWordPieceCase',
'GPT2BPETokenizer'], 'GPT2BPETokenizer'],
help='What type of tokenizer to use.') help='What type of tokenizer to use.')
group.add_argument('--data-impl', type=str, default='infer', group.add_argument('--data-impl', type=str, default='infer',
......
...@@ -33,6 +33,9 @@ def build_tokenizer(args): ...@@ -33,6 +33,9 @@ def build_tokenizer(args):
if args.tokenizer_type == 'BertWordPieceLowerCase': if args.tokenizer_type == 'BertWordPieceLowerCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True) lower_case=True)
elif args.tokenizer_type == 'BertWordPieceCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=False)
elif args.tokenizer_type == 'GPT2BPETokenizer': elif args.tokenizer_type == 'GPT2BPETokenizer':
assert args.merge_file is not None assert args.merge_file is not None
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
......
...@@ -104,7 +104,7 @@ def get_args(): ...@@ -104,7 +104,7 @@ def get_args():
group = parser.add_argument_group(title='tokenizer') group = parser.add_argument_group(title='tokenizer')
group.add_argument('--tokenizer-type', type=str, required=True, group.add_argument('--tokenizer-type', type=str, required=True,
choices=['BertWordPieceLowerCase', choices=['BertWordPieceLowerCase','BertWordPieceCase',
'GPT2BPETokenizer'], 'GPT2BPETokenizer'],
help='What type of tokenizer to use.') help='What type of tokenizer to use.')
group.add_argument('--vocab-file', type=str, default=None, group.add_argument('--vocab-file', type=str, default=None,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment