Add sentence or document suffix to output filename of preprocess_data.py

1e283449 · Jared Casper · 985f027d · 1e283449
Commit 1e283449 authored Apr 08, 2020 by Jared Casper
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 8 additions and 2 deletions

tools/preprocess_data.py tools/preprocess_data.py +8 -2

No files found.
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -137,14 +137,20 @@ def main():
    encoded_docs = pool.imap(encoder.encode, fin, 25)
    #encoded_docs = map(encoder.encode, fin)

+    level = "document"
+    if args.split_sentences:
+        level = "sentence"
+
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
-        output_bin_files[key] = "{}_{}.bin".format(args.output_prefix, key)
-        output_idx_files[key] = "{}_{}.idx".format(args.output_prefix, key)
+        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
+                                                      key, level)
+        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
+                                                      key, level)
        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                               impl=args.dataset_impl,
                                               vocab_size=tokenizer.vocab_size)