gaoqiong / lm-evaluation-harness

Commit 09915adf, authored May 29, 2023 by cardy20 (parent fd43d570)

    add master files

Showing 3 changed files, with 175 additions and 0 deletions:

    scripts/clean_training_data/compress_and_package.py    +73   -0
    scripts/clean_training_data/investigate_pile.py         +101  -0
    scripts/clean_training_data/sort_13_gram_buckets.py     +1    -0
scripts/clean_training_data/compress_and_package.py (new file, mode 100644)
# Compresses each sorted 13-gram bucket file with zstd, moves the result to the
# output directory, and copies info.json along with it.

import glob
import argparse
import os
import subprocess
import shutil
import logging

from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)


def process_task(working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm):
    # Compress the bucket in place, move the .zst to the output directory,
    # then delete the uncompressed original.
    command = f"zstd {bucket_file_path}"
    logger.info(command)
    subprocess.call(command, shell=True)

    compressed_file = bucket_file_path + ".zst"
    if output_directory:
        shutil.move(compressed_file, output_directory)

    os.remove(bucket_file_path)
    global_tqdm.update()


def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)

    original_info_file_path = os.path.join(working_directory, "info.json")
    assert os.path.exists(original_info_file_path)

    tasks = []
    bucket_file_paths = glob.glob(
        os.path.join(working_directory, "output", "*.bkt.txt.sorted")
    )
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory, bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)

    def on_done(_):
        return None

    def on_error(_):
        return None

    global_progress = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="file")
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))


parser = argparse.ArgumentParser(
    description="Compress sorted 13-gram bucket files and package them with info.json."
)
parser.add_argument("-dir", "--working_directory", required=True)
parser.add_argument("-output", "--output_directory", required=True)
parser.add_argument("-procs", "--process_count", type=int, default=8)

if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")

    logfile_path = "compress_and_package.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    compress_and_move(args.working_directory, args.output_directory, args.process_count)
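For orientation, here is a minimal usage sketch of compress_and_package.py, calling compress_and_move() directly rather than going through the argparse CLI. The directory names are hypothetical, and importing the script as a module assumes the repository root is on PYTHONPATH; running the script with -dir/-output/-procs achieves the same thing.

# Minimal usage sketch (hypothetical paths). The working directory must contain
# info.json and output/*.bkt.txt.sorted buckets produced by earlier pipeline stages.
from scripts.clean_training_data.compress_and_package import compress_and_move

compress_and_move(
    working_directory="13gram_buckets",     # hypothetical: holds info.json and output/
    output_directory="13gram_buckets_zst",  # hypothetical: destination for the .zst files
    process_count=8,                        # same as the script's --process_count default
)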
scripts/clean_training_data/investigate_pile.py (new file, mode 100644)
# Scans the Pile jsonl files and records the total document count, total character
# count, and per-file document start offsets into pile_statistics.json.

import os
import json
import glob
from functools import reduce

import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

from lm_eval.decontamination.archiver import Reader


def get_file_stats(file_path, tqdm_func, global_tqdm):
    reader = Reader()
    total_documents = 0
    total_size = 0
    update_frequency = 10000
    current_file_position = 0

    with tqdm_func(
        total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1
    ) as progress:
        for document in reader.read(file_path, get_meta=True):
            total_size += len(document)
            total_documents += 1

            if total_documents % update_frequency == 0:
                new_file_pos = reader.fh.tell()
                bytes_read = new_file_pos - current_file_position
                current_file_position = new_file_pos

                progress.update(bytes_read)
                global_tqdm.update(bytes_read)

    return (total_documents, total_size)


def get_files_zst():
    directory = "pile"
    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
    print(files)
    return files


def get_files():
    """jsonl files in directory"""
    directory = "pile"
    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl"))))
    print(files)
    return files


def get_stats():
    files = get_files()
    total_size_bytes = sum(map(lambda x: os.path.getsize(x), files))

    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(
        total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1
    )

    # Collect per-file (document_count, character_count) stats with the pool.
    tasks = [(get_file_stats, (file,)) for file in files]

    def on_done(_):
        return None

    def on_error(_):
        return None

    results = pool.map(global_tqdm, tasks, on_error, on_done)

    total_documents, total_size = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1]), results
    )

    start_offsets = []
    current_offset = 0
    for file_document_count, _ in results:
        start_offsets.append(current_offset)
        current_offset += file_document_count

    return (total_documents, total_size, start_offsets)


if __name__ == "__main__":
    version = 1.01
    print(f"Running version {version}")

    stats_file_path = "pile_statistics.json"
    if os.path.exists(stats_file_path):
        stats = json.load(open(stats_file_path, "r"))
    else:
        document_count, total_document_size_chars, start_offsets = get_stats()
        stats = {
            "Data": "Pile statistics",
            "Document Count": document_count,
            "Total Pile Characters": total_document_size_chars,
            "File Start Offsets": start_offsets,
        }
        json.dump(stats, open(stats_file_path, "w"), indent=4)

    print(f"document_count: {stats['Document Count']}")
    print(f"total_chars: {stats['Total Pile Characters']}")
    print(f"start_offsets: {stats['File Start Offsets']}")

\ No newline at end of file
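For reference, the pile_statistics.json written by this script has the following shape; the numbers below are round placeholders, not real Pile statistics.

# Illustrative shape of pile_statistics.json (keys match the script above; values are placeholders).
example_stats = {
    "Data": "Pile statistics",
    "Document Count": 1_000_000,                  # hypothetical total document count
    "Total Pile Characters": 5_000_000_000,       # hypothetical total character count
    "File Start Offsets": [0, 300_000, 650_000],  # document index at which each input file starts
}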
scripts/clean_training_data/sort_13_gram_buckets.py
@@ -34,6 +34,7 @@ def sort_13_gram_buckets(working_directory):
    for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
        bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))
        done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")
        if os.path.exists(done_file):
            logger.info(f"bucket {bucket_id} already processed, skipping")
            return
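The hunk above checks a per-bucket .done marker so that an interrupted sorting run can skip buckets it has already handled. A rough sketch of that convention, assuming the marker is simply an empty file created once a bucket finishes (the directory and bucket id below are hypothetical):

import os
from pathlib import Path

working_directory = "13grams"  # hypothetical working directory
bucket_id = "7"                # hypothetical bucket id pulled from the bucket file name

# Marker path matching the pattern checked in sort_13_gram_buckets above.
done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")

# Presumably written once the bucket has been sorted, so a rerun skips it.
Path(done_file).touch()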