"vscode:/vscode.git/clone" did not exist on "b03d2621a79f5bd13cba0029879c8671cdd2a0d0"
Commit 3af90c7a authored by Mohammad Shoeybi's avatar Mohammad Shoeybi
Browse files

Merge branch 'cased_update' into 'master'

Added cased WordPiece tokenizer arguments

See merge request ADLR/megatron-lm!76
parents 03d28809 26c5f12a
......@@ -52,9 +52,14 @@ def parse_args(extra_args_provider=None, defaults={},
# For default to be valid, it should not be provided in the
# arguments that are passed to the program. We check this by
# ensuring the arg is set to None.
assert getattr(args, key) is None, \
'defaults can only be overwritten for args with None values.'
setattr(args, key, defaults[key])
if getattr(args, key) is not None:
if args.rank == 0:
print('WARNING: overriding default arguments for {key}:{v} \
with {key}:{v2}'.format(key=key, v=defaults[key],
v2=getattr(args, key)),
flush=True)
else:
setattr(args, key, defaults[key])
# Check required arguments.
required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
......@@ -332,6 +337,7 @@ def _add_data_args(parser):
group.add_argument('--tokenizer-type', type=str,
default=None,
choices=['BertWordPieceLowerCase',
'BertWordPieceCase',
'GPT2BPETokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--data-impl', type=str, default='infer',
......
......@@ -33,6 +33,9 @@ def build_tokenizer(args):
if args.tokenizer_type == 'BertWordPieceLowerCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True)
elif args.tokenizer_type == 'BertWordPieceCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=False)
elif args.tokenizer_type == 'GPT2BPETokenizer':
assert args.merge_file is not None
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
......
......@@ -104,7 +104,7 @@ def get_args():
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--tokenizer-type', type=str, required=True,
choices=['BertWordPieceLowerCase',
choices=['BertWordPieceLowerCase','BertWordPieceCase',
'GPT2BPETokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--vocab-file', type=str, default=None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment