make_gpt2_sizes.py

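"""Build a size index for a directory of GPT-2 .npy shards.

Scans the given directory for *.npy files, counts the total number of
elements stored in each shard, and writes the resulting
{shard filename: element count} mapping as JSON to <shard_dir>/sizes.txt.

Usage:
    python make_gpt2_sizes.py <shard_dir>
"""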
import glob
import json
import os
import sys
import time

import numpy as np


if __name__ == '__main__':

    print('building the shard sizes ...')

    # require the shard directory as the single command-line argument
    if len(sys.argv) < 2:
        sys.exit('usage: python make_gpt2_sizes.py <shard_dir>')

    path = sys.argv[1]
    print('> reading numpy files from {}'.format(path))

    npy_files = glob.glob(os.path.join(path, '*.npy'))
    npy_files.sort()
    print('  found {} numpy files'.format(len(npy_files)))

    size_dict = {}
    counter = 0
    start_time = time.time()
    for filename in npy_files:
        # allow_pickle=True in case the shard was saved as an object (ragged) array
        data = np.load(filename, allow_pickle=True)
        # flatten the loaded array and count the total number of elements in the shard
        size = np.hstack(data).size
        np_filename = os.path.basename(filename)
        size_dict[np_filename] = size
        counter += 1
        if counter % 10 == 0:
            print('   processed {} files in {:.2f} seconds'.format(
                counter, time.time() - start_time))

    # write the {shard filename: element count} mapping as JSON to sizes.txt
    output_filename = os.path.join(path, 'sizes.txt')
    with open(output_filename, 'w') as f:
        json.dump(size_dict, f)
    print('> wrote sizes to {}'.format(output_filename))
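
# Example of the resulting sizes.txt (hypothetical shard names and counts):
#   {"shard_000.npy": 1048576, "shard_001.npy": 1048576}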