# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for creating datasets"""
import os
import math

import torch

from .samplers import DistributedBatchSampler
from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
from . import corpora

TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2


def should_split(split):
    """
    Given split proportions, checks whether the dataset should actually be split.
    Examples:
    >>> should_split([10,0,0])
    False
    >>> should_split([1,.1,.2])
    True
    """
    return max(split) / sum(split) != 1.

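# Usage sketch (comments only, not executed on import): `should_split` gates the
# call to `split_ds` further down in `make_dataset`. The proportions below are
# hypothetical.
#
#   split = [0.9, 0.05, 0.05]
#   if should_split(split):          # True, since max(split) / sum(split) != 1.
#       train_ds, val_ds, test_ds = split_ds(ds, split)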

def get_ext(path):
    """gets path extension"""
    return os.path.splitext(path)[1]


def get_dataset(path, **kwargs):
    """gets dataset object based on keyword args and file at `path`"""
    if supported_corpus(path):
        return corpora.NAMED_CORPORA[path](**kwargs)
    ext = get_ext(path)
    if '.json' in ext:
        text = json_dataset(path, **kwargs)
    elif ext in ['.csv', '.tsv']:
        text = csv_dataset(path, **kwargs)
    else:
        raise NotImplementedError('data file type %s is not supported' % (ext))
    return text

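# Usage sketch (comments only; the path and keys below are hypothetical):
# `get_dataset` dispatches on the file extension, so .json/.csv/.tsv files and
# corpora listed in `corpora.NAMED_CORPORA` are loaded through the same call.
#
#   text = get_dataset('data/reviews.csv', text_key='sentence', label_key='label')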

def supported_corpus(corpus_name):
    """checks if corpus name is defined in `corpora.py`"""
    return corpus_name in corpora.NAMED_CORPORA

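# Usage sketch (comments only; 'wikipedia' is assumed here to be one of the keys
# in `corpora.NAMED_CORPORA`):
#
#   if supported_corpus('wikipedia'):
#       text = get_dataset('wikipedia', text_key='text', label_key='label')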

def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
                 delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
                 model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
                 parallel_group=None, **kwargs):
    """function to create datasets+tokenizers for common options"""
    if isinstance(process_fn, str):
        process_fn = eval(process_fn)
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols

    def get_dataset_from_path(path_):
        if lazy:
            # get lazily loaded dataset
            named_corpora = False
            if supported_corpus(path_):
                named_corpora = True
                name = path_
                path_ = corpora.NAMED_CORPORA[path_].PATH
            if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
                # create cached version of dataset for lazy loading if it doesn't exist
                text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
                                   delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
                make_lazy(path_, text.X, data_type='data')
            # This should be a barrier, but the NCCL barrier assumes
            # device_index == rank, which does not hold in the model-parallel
            # case, so an all_reduce over the parallel group is used instead.
            counts = torch.cuda.LongTensor([1])
            torch.distributed.all_reduce(counts, group=parallel_group)
            assert counts[0].item() == torch.distributed.get_world_size(
                group=parallel_group)

            text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
        else:
            # get dataset
            text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
                               delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn)
        return text
    # get one or multiple datasets and concatenate
    if isinstance(path, str):
        path = [path]
    datasets = [get_dataset_from_path(p) for p in path]
    if len(datasets) == 1:
        ds = datasets[0]
    else:
        ds = ConcatDataset(datasets)
    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
                                   pad_token, character_converage, **kwargs)

    ds_type = kwargs.get('ds_type', '')
    ds.SetTokenizer(tokenizer)
    # Split dataset into train/val/test (and wrap bert dataset)
    if should_split(split):
        ds = split_ds(ds, split)
        if 'bert' in ds_type.lower():
            presplit_sentences = kwargs.get('presplit_sentences', False)
            dstype = bert_sentencepair_dataset
            ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
                  if d is not None else None for d in ds]
        elif ds_type.lower() == 'gpt2':
            ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
    else:
        if 'bert' in ds_type.lower():
            presplit_sentences = kwargs.get('presplit_sentences', False)
            dstype = bert_sentencepair_dataset
            ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
        elif ds_type.lower() == 'gpt2':
            ds = GPT2Dataset(ds, max_seq_len=seq_length)
    return ds, tokenizer
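

# Usage sketch (comments only, not executed on import). The path, sequence
# length, keys, and split proportions below are hypothetical; `ds_type` and
# `presplit_sentences` are read from **kwargs above, and with lazy=True a
# torch.distributed process group must already be initialized.
#
#   ds, tokenizer = make_dataset('data/corpus.json', seq_length=512,
#                                text_key='text', label_key='label',
#                                split=[0.95, 0.05], ds_type='gpt2')
#   train_ds, val_ds = ds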