import unittest

from transformers import AutoTokenizer, TrainingArguments, is_torch_available
from transformers.testing_utils import require_torch


if is_torch_available():
    import torch
    from torch.utils.data import IterableDataset

    from transformers import (
        AutoModelForSequenceClassification,
        DataCollatorForLanguageModeling,
        DataCollatorForPermutationLanguageModeling,
        GlueDataset,
        GlueDataTrainingArguments,
        LineByLineTextDataset,
        TextDataset,
        Trainer,
        default_data_collator,
    )


PATH_SAMPLE_TEXT = "./tests/fixtures/sample_text.txt"
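# ^ plain-text fixture shared by all language-modeling dataset tests below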


@require_torch
class DataCollatorIntegrationTest(unittest.TestCase):
    def test_default_with_dict(self):
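        # default_data_collator should rename "label" -> "labels" and stack
        # integer labels into a long tensor of shape (batch_size,).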
        features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
        batch = default_data_collator(features)
        self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
        self.assertEqual(batch["labels"].dtype, torch.long)
        self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

        # With label_ids
        features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
        batch = default_data_collator(features)
        self.assertTrue(batch["labels"].equal(torch.tensor([[0, 1, 2]] * 8)))
        self.assertEqual(batch["labels"].dtype, torch.long)
        self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

        # Features can already be tensors
        features = [{"label": i, "inputs": torch.randint(10, [10])} for i in range(8)]
        batch = default_data_collator(features)
        self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
        self.assertEqual(batch["labels"].dtype, torch.long)
        self.assertEqual(batch["inputs"].shape, torch.Size([8, 10]))

        # Labels can already be tensors
        features = [{"label": torch.tensor(i), "inputs": torch.randint(10, [10])} for i in range(8)]
        batch = default_data_collator(features)
        self.assertEqual(batch["labels"].dtype, torch.long)
        self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
        self.assertEqual(batch["labels"].dtype, torch.long)
        self.assertEqual(batch["inputs"].shape, torch.Size([8, 10]))

    def test_default_with_no_labels(self):
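        # A "label" (or "label_ids") value of None means the key is dropped from the batch.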
        features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
        batch = default_data_collator(features)
        self.assertTrue("labels" not in batch)
        self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

        # With label_ids
        features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
        batch = default_data_collator(features)
        self.assertTrue("labels" not in batch)
        self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

    def test_default_classification(self):
        MODEL_ID = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")
        data_collator = default_data_collator
        batch = data_collator(dataset.features)
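        # MRPC is a classification task, so labels must be collated as int64.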
        self.assertEqual(batch["labels"].dtype, torch.long)

    def test_default_regression(self):
        MODEL_ID = "distilroberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="sts-b", data_dir="./tests/fixtures/tests_samples/STS-B", overwrite_cache=True
        )
        dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")
        data_collator = default_data_collator
        batch = data_collator(dataset.features)
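        # STS-B is a regression task, so labels must be collated as float.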
        self.assertEqual(batch["labels"].dtype, torch.float)

    def test_lm_tokenizer_without_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        # ^ causal lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing on gpt2:
            data_collator(examples)

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
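        # ^ TextDataset pre-chunks the corpus into fixed-size blocks, so no padding is needed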
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

    def test_lm_tokenizer_with_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        # ^ masked lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
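        # Lines have different lengths, so the collator pads all 31 examples up to the
        # longest tokenized line (107 tokens) with the tokenizer's pad token.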
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
        self.assertEqual(batch["labels"].shape, torch.Size((31, 107)))

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

    def test_plm(self):
        tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
        data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)
        # ^ permutation lm
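        # Besides input_ids/labels, the PLM collator returns perm_mask and
        # target_mapping, both of shape (batch_size, seq_len, seq_len).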

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 112)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((31, 112, 112)))
        self.assertEqual(batch["target_mapping"].shape, torch.Size((31, 112, 112)))
        self.assertEqual(batch["labels"].shape, torch.Size((31, 112)))

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 512, 512)))
        self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 512, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

        example = [torch.randint(5, [5])]
        with self.assertRaises(ValueError):
            # Expect error due to odd sequence length
            data_collator(example)


if is_torch_available():

    class SampleIterableDataset(IterableDataset):
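        """Minimal IterableDataset (no __len__) that yields the raw lines of a text file."""
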
        def __init__(self, file_path):
            self.file_path = file_path

        def parse_file(self):
            # Use a context manager so the file handle is closed deterministically.
            with open(self.file_path, "r") as f:
                return f.readlines()

        def __iter__(self):
            return iter(self.parse_file())


@require_torch
class TrainerIntegrationTest(unittest.TestCase):
    def test_trainer_eval_mrpc(self):
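        # Evaluate an already fine-tuned checkpoint on the MRPC dev set; no training runs.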
        MODEL_ID = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
        )
        eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

        training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
        trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset)
        result = trainer.evaluate()
        self.assertLess(result["eval_loss"], 0.2)

    def test_trainer_eval_lm(self):
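        # Only checks dataset construction here; block_size is tied to the model's
        # maximum single-sentence length.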
        MODEL_ID = "distilroberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        dataset = LineByLineTextDataset(
            tokenizer=tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=tokenizer.max_len_single_sentence,
        )
        self.assertEqual(len(dataset), 31)

    def test_trainer_iterable_dataset(self):
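        # Trainer should build a DataLoader around an IterableDataset even though it
        # has no __len__ (and hence no sampler).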
        MODEL_ID = "sshleifer/tiny-distilbert-base-cased"
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
        train_dataset = SampleIterableDataset(PATH_SAMPLE_TEXT)
        training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
        trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
        loader = trainer.get_train_dataloader()
        self.assertIsInstance(loader, torch.utils.data.DataLoader)