test_modeling_common.py 28 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

thomwolf's avatar
thomwolf committed
16

17
import copy
Aymeric Augustin's avatar
Aymeric Augustin committed
18
19
import json
import logging
20
import os.path
Aymeric Augustin's avatar
Aymeric Augustin committed
21
import random
thomwolf's avatar
thomwolf committed
22
import shutil
Aymeric Augustin's avatar
Aymeric Augustin committed
23
import sys
24
import tempfile
thomwolf's avatar
thomwolf committed
25
import unittest
Aymeric Augustin's avatar
Aymeric Augustin committed
26
import uuid
thomwolf's avatar
thomwolf committed
27

28
from transformers import is_torch_available
29

30
from .utils import require_torch, slow, torch_device
31

Aymeric Augustin's avatar
Aymeric Augustin committed
32

33
if is_torch_available():
thomwolf's avatar
thomwolf committed
34
    import torch
35
    import numpy as np
thomwolf's avatar
thomwolf committed
36

37
38
39
40
41
42
43
44
    from transformers import (
        AdaptiveEmbedding,
        PretrainedConfig,
        PreTrainedModel,
        BertModel,
        BertConfig,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
thomwolf's avatar
thomwolf committed
45

46
47
48
49
if sys.version_info[0] == 2:

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
50

51
52
53
        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name
54

55
56
        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)
57
58


59
60
61
else:
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str
thomwolf's avatar
thomwolf committed
62

63

64
65
66
def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
67
        if "_range" in key or "_std" in key or "initializer_factor" in key:
68
69
70
            setattr(configs_no_init, key, 0.0)
    return configs_no_init

thomwolf's avatar
thomwolf committed
71

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@require_torch
class ModelTesterMixin:

    model_tester = None
    all_model_classes = ()
    test_torchscript = True
    test_pruning = True
    test_resize_embeddings = True
    test_head_masking = True
    is_encoder_decoder = False

    def test_save_load(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            out_2 = outputs[0].numpy()
            out_2[np.isnan(out_2)] = 0
94

95
96
97
            with TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model = model_class.from_pretrained(tmpdirname)
98
                model.to(torch_device)
99
                with torch.no_grad():
100
                    after_outputs = model(**inputs_dict)
thomwolf's avatar
thomwolf committed
101

102
103
104
                # Make sure we don't have nans
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
thomwolf's avatar
thomwolf committed
105
106
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
107

108
109
110
111
112
113
114
115
116
117
118
119
120
    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if param.requires_grad:
                    self.assertIn(
                        param.data.mean().item(),
                        [0.0, 1.0],
                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
                    )
thomwolf's avatar
thomwolf committed
121

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
    def test_determinism(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                first = model(**inputs_dict)[0]
                second = model(**inputs_dict)[0]
            out_1 = first.cpu().numpy()
            out_2 = second.cpu().numpy()
            out_1 = out_1[~np.isnan(out_1)]
            out_2 = out_2[~np.isnan(out_2)]
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        decoder_seq_length = (
            self.model_tester.decoder_seq_length
            if hasattr(self.model_tester, "decoder_seq_length")
            else self.model_tester.seq_length
        )
        encoder_seq_length = (
            self.model_tester.encoder_seq_length
            if hasattr(self.model_tester, "encoder_seq_length")
            else self.model_tester.seq_length
        )
        decoder_key_length = (
            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
        )
        encoder_key_length = (
            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
        )

        for model_class in self.all_model_classes:
            config.output_attentions = True
            config.output_hidden_states = False
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, False)
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
174
            )
175
            out_len = len(outputs)
thomwolf's avatar
thomwolf committed
176

177
178
179
            if self.is_encoder_decoder:
                self.assertEqual(out_len % 2, 0)
                decoder_attentions = outputs[(out_len // 2) - 1]
thomwolf's avatar
thomwolf committed
180
181
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
182
                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
thomwolf's avatar
thomwolf committed
183
                self.assertListEqual(
184
185
                    list(decoder_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
186
                )
thomwolf's avatar
thomwolf committed
187

188
            # Check attention is always last and order is fine
thomwolf's avatar
thomwolf committed
189
190
            config.output_attentions = True
            config.output_hidden_states = True
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)

            self_attentions = outputs[-1]
            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
            )
thomwolf's avatar
thomwolf committed
206

207
208
    def test_torchscript(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
thomwolf's avatar
thomwolf committed
209

210
        self._create_and_check_torchscript(config, inputs_dict)
thomwolf's avatar
thomwolf committed
211

212
213
    def test_torchscript_output_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
thomwolf's avatar
thomwolf committed
214

215
216
        config.output_attentions = True
        self._create_and_check_torchscript(config, inputs_dict)
thomwolf's avatar
thomwolf committed
217

218
219
    def test_torchscript_output_hidden_state(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
220

221
222
        config.output_hidden_states = True
        self._create_and_check_torchscript(config, inputs_dict)
thomwolf's avatar
thomwolf committed
223

224
225
226
    def _create_and_check_torchscript(self, config, inputs_dict):
        if not self.test_torchscript:
            return
227

228
229
230
231
232
233
234
        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
        configs_no_init.torchscript = True
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            model.to(torch_device)
            model.eval()
            inputs = inputs_dict["input_ids"]  # Let's keep only input_ids
thomwolf's avatar
thomwolf committed
235

236
237
238
239
            try:
                traced_gpt2 = torch.jit.trace(model, inputs)
            except RuntimeError:
                self.fail("Couldn't trace module.")
thomwolf's avatar
thomwolf committed
240

241
242
            with TemporaryDirectory() as tmp_dir_name:
                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
thomwolf's avatar
thomwolf committed
243

244
245
246
247
                try:
                    torch.jit.save(traced_gpt2, pt_file_name)
                except Exception:
                    self.fail("Couldn't save module.")
thomwolf's avatar
thomwolf committed
248

249
250
251
252
                try:
                    loaded_model = torch.jit.load(pt_file_name)
                except Exception:
                    self.fail("Couldn't load module.")
LysandreJik's avatar
LysandreJik committed
253

254
255
            model.to(torch_device)
            model.eval()
thomwolf's avatar
thomwolf committed
256

257
258
            loaded_model.to(torch_device)
            loaded_model.eval()
thomwolf's avatar
thomwolf committed
259

260
261
            model_params = model.parameters()
            loaded_model_params = loaded_model.parameters()
thomwolf's avatar
thomwolf committed
262

263
264
265
266
            models_equal = True
            for p1, p2 in zip(model_params, loaded_model_params):
                if p1.data.ne(p2.data).sum() > 0:
                    models_equal = False
thomwolf's avatar
thomwolf committed
267

268
            self.assertTrue(models_equal)
thomwolf's avatar
thomwolf committed
269

270
271
272
    def test_headmasking(self):
        if not self.test_head_masking:
            return
273

274
275
276
        global_rng.seed(42)
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        global_rng.seed()
LysandreJik's avatar
LysandreJik committed
277

278
279
280
281
282
283
284
        config.output_attentions = True
        config.output_hidden_states = True
        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            model.to(torch_device)
            model.eval()
LysandreJik's avatar
LysandreJik committed
285

286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
            # Prepare head_mask
            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
            head_mask = torch.ones(
                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
            )
            head_mask[0, 0] = 0
            head_mask[-1, :-1] = 0
            head_mask.requires_grad_(requires_grad=True)
            inputs = inputs_dict.copy()
            inputs["head_mask"] = head_mask

            outputs = model(**inputs)

            # Test that we can get a gradient back for importance score computation
            output = sum(t.sum() for t in outputs[0])
            output = output.sum()
            output.backward()
            multihead_outputs = head_mask.grad

            attentions = outputs[-1]
            hidden_states = outputs[-2]

            # Remove Nan
            for t in attentions:
                self.assertLess(
                    torch.sum(torch.isnan(t)), t.numel() / 4
                )  # Check we don't have more than 25% nans (arbitrary)
            attentions = [
                t.masked_fill(torch.isnan(t), 0.0) for t in attentions
            ]  # remove them (the test is less complete)

            self.assertIsNotNone(multihead_outputs)
            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
            self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
            self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)

    def test_head_pruning(self):
        if not self.test_pruning:
            return

        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
331

332
333
            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
334

335
336
337
338
339
340
341
342
343
            config.output_attentions = True
            config.output_hidden_states = False
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
            model.prune_heads(heads_to_prune)
            with torch.no_grad():
                outputs = model(**inputs_dict)
344

345
            attentions = outputs[-1]
346

347
348
349
            self.assertEqual(attentions[0].shape[-3], 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
LysandreJik's avatar
LysandreJik committed
350

351
352
353
    def test_head_pruning_save_load_from_pretrained(self):
        if not self.test_pruning:
            return
LysandreJik's avatar
LysandreJik committed
354

355
356
357
358
359
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
360

361
362
363
364
365
366
367
            config.output_attentions = True
            config.output_hidden_states = False
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
            model.prune_heads(heads_to_prune)
368

369
370
371
            with TemporaryDirectory() as temp_dir_name:
                model.save_pretrained(temp_dir_name)
                model = model_class.from_pretrained(temp_dir_name)
372
                model.to(torch_device)
373

374
375
376
377
378
379
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
            self.assertEqual(attentions[0].shape[-3], 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
380

381
382
383
    def test_head_pruning_save_load_from_config_init(self):
        if not self.test_pruning:
            return
384

385
386
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
387

388
389
            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
390

391
392
            config.output_attentions = True
            config.output_hidden_states = False
393

394
395
            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
            config.pruned_heads = heads_to_prune
396

397
398
399
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
400

401
402
403
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
404

405
406
407
            self.assertEqual(attentions[0].shape[-3], 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
408

409
410
411
    def test_head_pruning_integration(self):
        if not self.test_pruning:
            return
412

413
414
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
415

416
417
            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
418

419
420
            config.output_attentions = True
            config.output_hidden_states = False
421

422
423
            heads_to_prune = {0: [0], 1: [1, 2]}
            config.pruned_heads = heads_to_prune
424

425
426
427
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
428

429
430
431
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
432

433
434
435
436
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
thomwolf's avatar
thomwolf committed
437

438
439
440
            with TemporaryDirectory() as temp_dir_name:
                model.save_pretrained(temp_dir_name)
                model = model_class.from_pretrained(temp_dir_name)
441
                model.to(torch_device)
thomwolf's avatar
thomwolf committed
442

443
444
445
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
LysandreJik's avatar
LysandreJik committed
446

447
448
449
450
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
thomwolf's avatar
thomwolf committed
451

452
453
            heads_to_prune = {0: [0], 2: [1, 2]}
            model.prune_heads(heads_to_prune)
454

455
456
457
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
458

459
460
461
462
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
463

464
            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
thomwolf's avatar
thomwolf committed
465

466
467
    def test_hidden_states_output(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
thomwolf's avatar
thomwolf committed
468

469
470
471
472
        for model_class in self.all_model_classes:
            config.output_hidden_states = True
            config.output_attentions = False
            model = model_class(config)
473
            model.to(torch_device)
thomwolf's avatar
thomwolf committed
474
            model.eval()
thomwolf's avatar
thomwolf committed
475
            with torch.no_grad():
476
477
478
479
480
481
482
483
484
485
486
487
488
                outputs = model(**inputs_dict)
            hidden_states = outputs[-1]
            self.assertEqual(model.config.output_attentions, False)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [
                    self.model_tester.encoder_seq_length
                    if hasattr(self.model_tester, "encoder_seq_length")
                    else self.model_tester.seq_length,
                    self.model_tester.hidden_size,
                ],
489
            )
thomwolf's avatar
thomwolf committed
490

491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
    def test_resize_tokens_embeddings(self):
        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.test_resize_embeddings:
            return

        for model_class in self.all_model_classes:
            config = copy.deepcopy(original_config)
            model = model_class(config)

            model_vocab_size = config.vocab_size
            # Retrieve the embeddings and clone theme
            model_embed = model.resize_token_embeddings(model_vocab_size)
            cloned_embeddings = model_embed.weight.clone()

            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
            # Check that it actually resizes the embeddings matrix
            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)

            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
            # Check that it actually resizes the embeddings matrix
            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
            models_equal = True
            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
                if p1.data.ne(p2.data).sum() > 0:
                    models_equal = False

            self.assertTrue(models_equal)

    def test_model_common_attributes(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
            model.set_input_embeddings(torch.nn.Embedding(10, 10))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))

    def test_tie_model_weights(self):
        if not self.test_torchscript:
            return

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def check_same_values(layer_1, layer_2):
            equal = True
            for p1, p2 in zip(layer_1.weight, layer_2.weight):
                if p1.data.ne(p2.data).sum() > 0:
                    equal = False
            return equal

        for model_class in self.all_model_classes:
            config.torchscript = True
            model_not_tied = model_class(config)
            if model_not_tied.get_output_embeddings() is None:
                continue

            params_not_tied = list(model_not_tied.parameters())

            config_tied = copy.deepcopy(config)
            config_tied.torchscript = False
            model_tied = model_class(config_tied)
            params_tied = list(model_tied.parameters())

            # Check that the embedding layer and decoding layer are the same in size and in value
            self.assertGreater(len(params_not_tied), len(params_tied))
            # self.assertTrue(check_same_values(embeddings, decoding))

            # # Check that after modification, they remain the same.
            # embeddings.weight.data.div_(2)
            # # Check that the embedding layer and decoding layer are the same in size and in value
            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
            # self.assertTrue(check_same_values(embeddings, decoding))

            # # Check that after modification, they remain the same.
            # decoding.weight.data.div_(4)
            # # Check that the embedding layer and decoding layer are the same in size and in value
            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
            # self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after resize they remain tied.
            model_tied.resize_token_embeddings(config.vocab_size + 10)
            params_tied_2 = list(model_tied.parameters())
            self.assertGreater(len(params_not_tied), len(params_tied))
            self.assertEqual(len(params_tied_2), len(params_tied))

            # decoding.weight.data.mul_(20)
            # # Check that the embedding layer and decoding layer are the same in size and in value
            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))

    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.is_encoder_decoder:
            input_ids = inputs_dict["input_ids"]
            del inputs_dict["input_ids"]
        else:
            encoder_input_ids = inputs_dict["encoder_input_ids"]
            decoder_input_ids = inputs_dict["decoder_input_ids"]
            del inputs_dict["encoder_input_ids"]
            del inputs_dict["decoder_input_ids"]

        for model_class in self.all_model_classes:
            model = model_class(config)
601
            model.to(torch_device)
thomwolf's avatar
thomwolf committed
602
            model.eval()
603
604
605
606
607
608
609
610

            wte = model.get_input_embeddings()
            if not self.is_encoder_decoder:
                inputs_dict["inputs_embeds"] = wte(input_ids)
            else:
                inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
                inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)

thomwolf's avatar
thomwolf committed
611
            with torch.no_grad():
612
613
614
                outputs = model(**inputs_dict)


thomwolf's avatar
thomwolf committed
615
616
617
618
619
620
class ConfigTester(object):
    def __init__(self, parent, config_class=None, **kwargs):
        self.parent = parent
        self.config_class = config_class
        self.inputs_dict = kwargs

thomwolf's avatar
thomwolf committed
621
622
    def create_and_test_config_common_properties(self):
        config = self.config_class(**self.inputs_dict)
623
624
625
626
        self.parent.assertTrue(hasattr(config, "vocab_size"))
        self.parent.assertTrue(hasattr(config, "hidden_size"))
        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
        self.parent.assertTrue(hasattr(config, "num_hidden_layers"))
thomwolf's avatar
thomwolf committed
627

thomwolf's avatar
thomwolf committed
628
629
630
631
632
633
634
635
    def create_and_test_config_to_json_string(self):
        config = self.config_class(**self.inputs_dict)
        obj = json.loads(config.to_json_string())
        for key, value in self.inputs_dict.items():
            self.parent.assertEqual(obj[key], value)

    def create_and_test_config_to_json_file(self):
        config_first = self.config_class(**self.inputs_dict)
636
        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
thomwolf's avatar
thomwolf committed
637
638
639
640
641
642
        config_first.to_json_file(json_file_path)
        config_second = self.config_class.from_json_file(json_file_path)
        os.remove(json_file_path)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
thomwolf's avatar
thomwolf committed
643
        self.create_and_test_config_common_properties()
thomwolf's avatar
thomwolf committed
644
645
646
647
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()


648
global_rng = random.Random()
thomwolf's avatar
thomwolf committed
649
650


thomwolf's avatar
thomwolf committed
651
652
653
def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Creates a random int32 tensor of the shape within the vocab size."""
    if rng is None:
654
        rng = global_rng
thomwolf's avatar
thomwolf committed
655

thomwolf's avatar
thomwolf committed
656
657
658
    total_dims = 1
    for dim in shape:
        total_dims *= dim
thomwolf's avatar
thomwolf committed
659

thomwolf's avatar
thomwolf committed
660
661
662
    values = []
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))
thomwolf's avatar
thomwolf committed
663

664
    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
thomwolf's avatar
thomwolf committed
665
666


667
668
669
670
671
672
673
674
675
676
677
678
679
def floats_tensor(shape, scale=1.0, rng=None, name=None):
    """Creates a random float32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = global_rng

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.random() * scale)

680
    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
681
682


683
@require_torch
thomwolf's avatar
thomwolf committed
684
class ModelUtilsTest(unittest.TestCase):
685
    @slow
thomwolf's avatar
thomwolf committed
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)

            model = BertModel.from_pretrained(model_name)
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)