Commit 1e283449 authored by Jared Casper
Browse files

Add sentence or document suffix to output filename of preprocess_data.py

parent 985f027d
...@@ -137,14 +137,20 @@ def main(): ...@@ -137,14 +137,20 @@ def main():
encoded_docs = pool.imap(encoder.encode, fin, 25) encoded_docs = pool.imap(encoder.encode, fin, 25)
#encoded_docs = map(encoder.encode, fin) #encoded_docs = map(encoder.encode, fin)
level = "document"
if args.split_sentences:
level = "sentence"
print(f"Vocab size: {tokenizer.vocab_size}") print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Output prefix: {args.output_prefix}") print(f"Output prefix: {args.output_prefix}")
output_bin_files = {} output_bin_files = {}
output_idx_files = {} output_idx_files = {}
builders = {} builders = {}
for key in args.json_keys: for key in args.json_keys:
output_bin_files[key] = "{}_{}.bin".format(args.output_prefix, key) output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
output_idx_files[key] = "{}_{}.idx".format(args.output_prefix, key) key, level)
output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
key, level)
builders[key] = indexed_dataset.make_builder(output_bin_files[key], builders[key] = indexed_dataset.make_builder(output_bin_files[key],
impl=args.dataset_impl, impl=args.dataset_impl,
vocab_size=tokenizer.vocab_size) vocab_size=tokenizer.vocab_size)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment