Commit 3af90c7a authored by Mohammad Shoeybi

Merge branch 'cased_update' into 'master'

added case wordpiece arguments

See merge request ADLR/megatron-lm!76
parents 03d28809 26c5f12a
@@ -52,9 +52,14 @@ def parse_args(extra_args_provider=None, defaults={},
         # For default to be valid, it should not be provided in the
         # arguments that are passed to the program. We check this by
         # ensuring the arg is set to None.
-        assert getattr(args, key) is None, \
-            'defaults can only be overwritten for args with None values.'
-        setattr(args, key, defaults[key])
+        if getattr(args, key) is not None:
+            if args.rank == 0:
+                print('WARNING: overriding default arguments for {key}:{v} \
+                       with {key}:{v2}'.format(key=key, v=defaults[key],
+                                               v2=getattr(args, key)),
+                      flush=True)
+        else:
+            setattr(args, key, defaults[key])
 
     # Check required arguments.
     required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
@@ -332,6 +337,7 @@ def _add_data_args(parser):
     group.add_argument('--tokenizer-type', type=str,
                        default=None,
                        choices=['BertWordPieceLowerCase',
+                                'BertWordPieceCase',
                                 'GPT2BPETokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--data-impl', type=str, default='infer',
@@ -33,6 +33,9 @@ def build_tokenizer(args):
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True)
+    elif args.tokenizer_type == 'BertWordPieceCase':
+        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
+                                            lower_case=False)
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
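Both Bert choices reuse the same _BertWordPieceTokenizer class and differ only in the lower_case flag, so a cased vocabulary keeps distinctions such as 'Apple' vs. 'apple'. A self-contained toy sketch of that dispatch (the Toy* names and vocab path are stand-ins, not the real classes):

class ToyWordPieceTokenizer:
    def __init__(self, vocab_file, lower_case):
        self.vocab_file = vocab_file
        self.lower_case = lower_case

    def tokenize(self, text):
        # Real WordPiece does a vocab-driven subword lookup; lowercasing
        # (or not) before that lookup is the only difference shown here.
        text = text.lower() if self.lower_case else text
        return text.split()

def build_toy_tokenizer(tokenizer_type, vocab_file):
    if tokenizer_type == 'BertWordPieceLowerCase':
        return ToyWordPieceTokenizer(vocab_file, lower_case=True)
    elif tokenizer_type == 'BertWordPieceCase':
        return ToyWordPieceTokenizer(vocab_file, lower_case=False)
    raise ValueError('unknown tokenizer type: {}'.format(tokenizer_type))

tok = build_toy_tokenizer('BertWordPieceCase', 'vocab.txt')
print(tok.tokenize('Cased Input Stays Cased'))  # ['Cased', 'Input', 'Stays', 'Cased']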
@@ -104,7 +104,7 @@ def get_args():
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--tokenizer-type', type=str, required=True,
-                       choices=['BertWordPieceLowerCase',
+                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
                                 'GPT2BPETokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--vocab-file', type=str, default=None,
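The preprocessing script exposes the same choice through argparse, so an unsupported value fails fast at parse time rather than after data loading starts. A quick runnable check of the updated choices list (parser construction abbreviated to the one relevant flag):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--tokenizer-type', type=str, required=True,
                    choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
                             'GPT2BPETokenizer'])

args = parser.parse_args(['--tokenizer-type', 'BertWordPieceCase'])
print(args.tokenizer_type)  # BertWordPieceCase
# Anything outside the choices list, e.g. 'BertWordPieceCased', exits with an
# 'invalid choice' error before any tokenizer is built.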