investigate_pile.py
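
# Scans the Pile shards (*.jsonl.zst*) under ./pile and records the total
# document count, total character count, and cumulative per-file document
# start offsets in pile_statistics.json.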
import glob
import json
import os
from functools import reduce

import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

from lm_eval.decontamination.archiver import Reader


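# Worker: stream a single compressed shard and return (document count, total
# size), updating both the per-file and the shared global progress bars.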
def get_file_stats(file_path, tqdm_func, global_tqdm):
    reader = Reader()
    total_documents = 0
    total_size = 0
    update_frequency = 10000
    current_file_position = 0

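    # Per-file progress bar, measured in bytes of the compressed shard.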
    with tqdm_func(
        total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1
    ) as progress:
        for document in reader.read(file_path, get_meta=True):
            total_size += len(document)
            total_documents += 1

            if total_documents % update_frequency == 0:
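                # Advance both progress bars by the bytes consumed from the
                # underlying file handle since the last update.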
                new_file_pos = reader.fh.tell()
                bytes_read = new_file_pos - current_file_position
                current_file_position = new_file_pos
                progress.update(bytes_read)
                global_tqdm.update(bytes_read)

    return (total_documents, total_size)


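# List all files matching *.jsonl.zst* in the ./pile directory, sorted by name.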
def get_files():
    directory = "pile"
    files = sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))
    print(files)
    return files


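# Process every shard with a worker pool and aggregate the document count,
# total size, and per-file document start offsets.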
def get_stats():
    files = get_files()
    total_size_bytes = sum(os.path.getsize(x) for x in files)

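    # Four worker processes; the global progress bar tracks bytes across all shards.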
    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(
        total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1
    )

    # Compute per-file stats with the pool
    tasks = [(get_file_stats, (file,)) for file in files]

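    # No-op completion/error callbacks passed to pool.map below.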
    def on_done(_):
        return None

    def on_error(_):
        return None

    results = pool.map(global_tqdm, tasks, on_error, on_done)

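    # Element-wise sum of the per-file (document count, size) pairs.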
    total_documents, total_size = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1]), results
    )

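    # Cumulative document index at which each file's documents start.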
    start_offsets = []
    current_offset = 0
    for file_document_count, _ in results:
        start_offsets.append(current_offset)
        current_offset += file_document_count

    return (total_documents, total_size, start_offsets)


if __name__ == "__main__":
    version = 1.01
    print(f"Running version {version}")

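    # Reuse cached statistics when available; otherwise compute and save them.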
    stats_file_path = "pile_statistics.json"
    if os.path.exists(stats_file_path):
        with open(stats_file_path, "r") as stats_file:
            stats = json.load(stats_file)
    else:
        document_count, total_document_size_chars, start_offsets = get_stats()
        stats = {
            "Data": "Pile statistics",
            "Document Count": document_count,
            "Total Pile Characters": total_document_size_chars,
            "File Start Offsets": start_offsets,
        }
        with open(stats_file_path, "w") as stats_file:
            json.dump(stats, stats_file, indent=4)

    print(f"document_count: {stats['Document Count']}")
    print(f"total_chars: {stats['Total Pile Characters']}")
    print(f"start_offsets: {stats['File Start Offsets']}")