# This file isn't really a formal automated test; it's just a place to
# put some code used during development and manual testing of
# indexed_dataset.
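#
# Example invocation (the data prefix and vocab/merge paths below are
# placeholders, not real files):
#   python test_indexed_dataset.py --data /path/to/dataset_prefix \
#       --dataset-impl mmap --count 5 \
#       --tokenizer-type GPT2BPETokenizer \
#       --vocab-file gpt2-vocab.json --merge-file gpt2-merges.txt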

import argparse
import os
import sys

import torch

# Make the repository root importable so the megatron package resolves when
# this script is run directly from the tests directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(script_dir, "../../../"))

from megatron.data import indexed_dataset
from megatron.tokenizer import build_tokenizer


def test_indexed_dataset(args):
    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    tokenizer = build_tokenizer(args)
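    # Quick sanity output: number of document boundaries, number of
    # sentences/sequences, and the index of the last document boundary.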
    print(len(ds.doc_idx))
    print(len(ds))
    print(ds.doc_idx[-1])
    if ds.supports_prefetch:
        # just prefetch the whole thing in test (so assume it is small)
        ds.prefetch(range(len(ds)))
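    # doc_idx has one more entry than there are documents, so at most
    # len(doc_idx) - 1 documents can be printed.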
    if args.count > len(ds.doc_idx) - 1:
        args.count = len(ds.doc_idx) - 1

    for i in range(args.count):
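        # Document i spans the sentences ds.doc_idx[i] .. ds.doc_idx[i + 1]
        # in the flattened sentence index.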
        start = ds.doc_idx[i]
        end = ds.doc_idx[i + 1]
        ids = ds[start:end]
        print(f"Document {i}:")
        print("--------------")
        for s in ids:
            assert len(s) > 0
            token_ids = s.data.tolist()
            text = tokenizer.detokenize(token_ids)
            print(text)
            print("---")


def test_indexed_dataset_get(args):
    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    tokenizer = build_tokenizer(args)
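    # Exercise .get() on the first sequence: read it in full, then a suffix,
    # a prefix, and a middle slice via the offset/length arguments.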
    size = ds.sizes[0]
    print(f"size: {size}")
    full = ds.get(0)
    print(full)
    # print(tokenizer.detokenize(full.data.tolist()))
    print("---")
    end = ds.get(0, offset=size - 10)
    print(end)
    # print(tokenizer.detokenize(end.data.tolist()))

    start = ds.get(0, length=10)
    print(start)
    # print(tokenizer.detokenize(start.data.tolist()))

    part = ds.get(0, offset=2, length=8)
    print(part)
    # print(tokenizer.detokenize(part.data.tolist()))

# def test_albert_dataset(args):
#     # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
#     # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl)
#     # ds = AlbertDataset(idataset, tokenizer)
#     ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
#                                   args.epochs, args.max_num_samples,
#                                   args.masked_lm_prob, args.seq_length,
#                                   args.short_seq_prob, args.seed)
#     truncated = 0
#     total = 0
#     for i, s in enumerate(ds):
#         ids = s['text']
#         tokens = ds.tokenizer.convert_ids_to_tokens(ids)
#         print(tokens)
#         if i >= args.count-1:
#             exit()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='prefix to data files')
    parser.add_argument('--dataset-impl', type=str, default='infer',
                        choices=['lazy', 'cached', 'mmap', 'infer'])
    parser.add_argument('--count', type=int, default=10,
                        help='Number of samples/documents to print')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase',
                                'GPT2BPETokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')

    parser.add_argument('--epochs', type=int, default=5,
                        help='Number of epochs to plan for')
    parser.add_argument('--max-num-samples', type=int, default=None,
                        help='Maximum number of samples to plan for')
    parser.add_argument('--masked-lm-prob', type=float, default=0.15,
                        help='probability of masking tokens')
    parser.add_argument('--seq-length', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--short-seq-prob', type=float, default=0.1,
                        help='probability of creating a short sequence')
    parser.add_argument('--seed', type=int, default=1234,
                        help='random seed')
    args = parser.parse_args()
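    # build_tokenizer() expects a few distributed-training args that this
    # standalone script doesn't otherwise use; stub them out here.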
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1

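    # With 'infer', indexed_dataset inspects the existing index file to
    # decide which implementation to use.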
    if args.dataset_impl == "infer":
        args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)

    # test_albert_dataset(args)
    test_indexed_dataset_get(args)


if __name__ == "__main__":
    main()