bug fixes in partitioned data preprocessor

84a43b13 · Mike Chrzanowski · Jared Casper · 8ce8256f · 84a43b13 · 84a43b13
Commit 84a43b13, authored Nov 29, 2022 by Mike Chrzanowski; committed by Jared Casper on Nov 29, 2022
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 12 additions and 6 deletions

megatron/tokenizer/tokenizer.py +3 -1

tools/preprocess_data_partitions.py +9 -5

No files found.
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -15,8 +15,10 @@ def build_tokenizer(args):
        print('> building {} tokenizer ...'.format(args.tokenizer_type),
              flush=True)
-    # Select and instantiate the tokenizer.
+    if args.tokenizer_type != 'SentencePieceTokenizer':
        assert args.vocab_file is not None
+    # Select and instantiate the tokenizer.
    if args.tokenizer_type == 'BertWordPieceLowerCase':
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                            lower_case=True,

--- a/tools/preprocess_data_partitions.py
+++ b/tools/preprocess_data_partitions.py
@@ -174,6 +174,7 @@ class Partition(object):
            self.print_processing_stats(i, proc_start, total_bytes_processed)
        fin.close()
+        builders[key].finalize(output_idx_files[key])
 def get_args():
@@ -219,8 +220,7 @@ def get_args():
    args = parser.parse_args()
    args.keep_empty = False
-    if (args.tokenizer_type.lower().startswith('bert')
+    if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences:
-        if not args.split_sentences:
        print("Are you sure you don't want to split sentences?")
    # some default/dummy values for the tokenizer
@@ -265,7 +265,11 @@ def main():
    if args.partitions == 1:
        file_name, extension = os.path.splitext(args.input)
        sentence_split_file = file_name + "_ss" + extension
-        in_ss_out_names.append((args.input, sentence_split_file, args.output_prefix))
+        file_names = {
+            'partition': args.input,
+            'sentence_split': sentence_split_file,
+            'output_prefix': args.output_prefix}
+        in_ss_out_names.append(file_names)
    else:
        in_file_names = glob.glob(args.input)
@@ -358,7 +362,7 @@ def main():
            full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix,
                                                             key, level)
            builders[key].merge_file_(full_partition_output_prefix)
-        builder[key].finalize(output_idx_files[key])
+        builders[key].finalize(output_idx_files[key])
 if __name__ == '__main__':