Commit 3207c19a authored by Jared Casper
Browse files

Missed some changes from next-best-lm branch.

parent 46ffb75c
......@@ -89,7 +89,7 @@ def set_global_variables(args):
set_args(args)
_build_num_microbatches_calculator(args)
if args.vocab_file:
if args.vocab_file or args.tokenizer_model:
_ = _build_tokenizer(args)
_set_tensorboard_writer(args)
_set_adlr_autoresume(args)
......
......@@ -104,6 +104,8 @@ def get_args():
help='Append an <eod> token to the end of a document.')
group.add_argument('--lang', type=str, default='english',
help='Language to use for NLTK-powered sentence splitting.')
group.add_argument('--tokenizer-model', type=str, default=None,
help='sentencepeice tokenizer model.')
group = parser.add_argument_group(title='output data')
......
......@@ -326,6 +326,9 @@ def main():
for p in processes:
p.join()
if args.partitions == 1:
return
# encode partition files in parallel
processes = []
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment