Commit 47e7eb80 authored by Mostofa Patwary's avatar Mostofa Patwary

Added dataset cleanup code and fixed existing code

parent 9c2eb971
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Filter and clean documents:
Capable to clean docs with less than 512 characters, less than
256 characters and contains javascript, fix text and clean text
"""
import argparse
from functools import partial
import ftfy
import json
from langdetect import detect
import multiprocessing
import os
from pathlib import Path
import re
import time


def process_doc(json_line, args):
# Read the line.
document = json.loads(json_line)
text = document['text']
    output = {'remove_512': False, 'remove_256_javascript': False,
              'remove_512_non_english': False, 'ftfy_fix_text': False,
              'general_cleaning': False}
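    # Each flag above records which task (if any) fired for this document;
    # the last element of the returned tuple tells the caller whether the
    # document should go to the filtered output instead of the cleaned one.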
try:
        # Remove all docs with fewer than 512 characters.
if "remove_512" in args.tasks:
if len(text) < 512:
output['remove_512'] = True
return output, text, document, True
        # Remove docs shorter than 256 characters that contain javascript.
if "remove_256_javascript" in args.tasks:
if len(text) < 256 and 'javascript' in text.lower():
output['remove_256_javascript'] = True
return output, text, document, True
        # Remove docs shorter than 512 characters that are not in English.
if "remove_512_non_english" in args.tasks:
if len(text) < 512 and detect(text) != 'en':
output['remove_512_non_english'] = True
return output, text, document, True
        # Fix the text using ftfy; the doc is kept, so to_filter is False.
if "ftfy_fix_text" in args.tasks:
fixed_text = ftfy.fix_text(text)
output['ftfy_fix_text'] = True
return output, fixed_text, document, False
# Cleaning extra spaces and newlines
if "general_cleaning" in args.tasks:
cleaned_text = re.sub(r" +|\b\n+ |\b\n+", " ", text)
output['general_cleaning'] = True
return output, cleaned_text, document, False
except Exception as e:
        print('Error: *************************\n{}\ntext: {}'.format(
            e, text), flush=True)
        return output, text, document, True
# don't remove
    return output, text, document, False


def process_set(args, input_file, output_f_cleaned, output_f_filtered):
print(' > working on {} ...'.format(input_file), flush=True)
num_docs = num_remove_512 = num_remove_java = num_remove_512_non_english \
= num_ftfy_fix_text = num_general_cleaning = 0
# Output file and counters.
output_cleaned = open(output_f_cleaned, 'wb')
output_filtered = open(output_f_filtered, 'wb')
start_time = time.time()
# Setup multi-processing.
num_workers = 40
fin = open(input_file, 'r', encoding='utf-8')
pool = multiprocessing.Pool(num_workers)
process_doc_partial = partial(process_doc, args=args)
processed_docs = pool.imap(process_doc_partial, fin, 500)
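    # imap streams results back in input order; the chunksize of 500 hands
    # each worker batches of lines to amortize inter-process communication.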
# Process documents.
for output, text, document, to_filter in processed_docs:
num_docs += 1
num_remove_512 += 1 if output['remove_512'] else 0
num_remove_java += 1 if output['remove_256_javascript'] else 0
num_remove_512_non_english += 1 if output['remove_512_non_english'] \
else 0
num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0
        num_general_cleaning += 1 if output['general_cleaning'] else 0
document['text'] = text
myjson = json.dumps(document, ensure_ascii=False)
if to_filter:
output_filtered.write(myjson.encode('utf-8'))
output_filtered.write('\n'.encode('utf-8'))
else:
output_cleaned.write(myjson.encode('utf-8'))
output_cleaned.write('\n'.encode('utf-8'))
if num_docs % args.log_interval == 0:
print(' processed {:9d} documents in {:.2f} seconds ...'.format(
num_docs, time.time() - start_time), flush=True)
# Close the file.
output_cleaned.close()
output_filtered.close()
fin.close()
# Print stats.
    print(' >> total docs: {} remove_512 {} remove_256_javascript {} '
          'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'
          .format(num_docs, num_remove_512, num_remove_java,
                  num_remove_512_non_english, num_ftfy_fix_text,
                  num_general_cleaning), flush=True)


if __name__ == '__main__':
print('parsing the arguments ...')
parser = argparse.ArgumentParser()
    parser.add_argument('--input-files', nargs='*', required=True,
                        help='Input json files that need to be cleaned')
    parser.add_argument('--tasks', nargs='*', required=True,
                        help='Tasks to perform on the input files')
parser.add_argument('--output-path', type=str, default=None,
help='Directory where the output should go')
parser.add_argument('--log-interval', type=int, default=100,
help='Log interval')
args = parser.parse_args()
print('cleanup dataset ...')
for input_file in args.input_files:
        input_filename, input_filename_ext = os.path.splitext(
            Path(input_file).name)
        output_f_cleaned = os.path.join(args.output_path, input_filename +
                                        "_cleaned" + input_filename_ext)
        output_f_filtered = os.path.join(args.output_path, input_filename +
                                         "_filtered" + input_filename_ext)
process_set(args, input_file, output_f_cleaned, output_f_filtered)
print('done :-)', flush=True)
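For reference, a minimal sketch of the loose-JSON line format the script
consumes; the 'url' field is a hypothetical extra, since process_doc only
reads 'text' and passes every other field through unchanged:

    import json

    # One document per line; only the 'text' field is inspected.
    line = json.dumps({'text': 'Document body ...',
                       'url': 'https://example.com'})
    doc = json.loads(line)
    assert 'text' in doc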
@@ -181,7 +181,7 @@ def free_ngram(line, args, key, ngrams, ngrams_freq_sorted):
             len(text_buf_ngram_free[0]) < len(myjson[key]):
         trimmed = 1
-    return text_buf_ngram_free, trimmed, local_ngram
+    return text_buf_ngram_free, trimmed, myjson, local_ngram

 # insert word sequence into dictionary
 def insert_dict(words, ngrams, pos):
@@ -312,7 +312,7 @@ def get_ngrams_below_threshold(args, ngrams, ngrams_below_threshold, \
     free_ngrams_abt = pool.imap(free_ngram_abt_partial, fin, 500)
     counter = 0
-    for _, _, local_ngram in free_ngrams_abt:
+    for _, _, _, local_ngram in free_ngrams_abt:
         counter += 1
         if counter % 1000 == 0:
             print(' [compute_stat]> processed {} documents in {:.2f} seconds ...'.
@@ -361,7 +361,7 @@ def clean_ngrams_below_threshold(args, ngrams_below_threshold, dedup_file, \
     out_f = open(args.output, 'wb')
-    for text_buf_ngram_free, trimmed, _ in free_ngrams_clean:
+    for text_buf_ngram_free, trimmed, myjson, _ in free_ngrams_clean:
         counter += 1
         try:
@@ -380,9 +380,11 @@ def clean_ngrams_below_threshold(args, ngrams_below_threshold, dedup_file, \
             for i in range(len(text_buf_ngram_free)):
                 split_id_string = id_prefix + '-{:010d}'.format(int(\
                     counter)) + '-{:010d}'.format(int(i))
-                outjson = json.dumps({"text":text_buf_ngram_free[i],
-                    id_prefix+"_split_id":split_id_string},
-                    ensure_ascii=False)
+                myjson[dedup_key] = text_buf_ngram_free[i]
+                outjson = json.dumps(myjson, ensure_ascii=False)
+                #outjson = json.dumps({"text":text_buf_ngram_free[i],
+                #    id_prefix+"_split_id":split_id_string},
+                #    ensure_ascii=False)
                 out_f.write(outjson.encode('utf-8'))
                 out_f.write('\n'.encode('utf-8'))