"examples/vscode:/vscode.git/clone" did not exist on "17c7154a94ad8e7e92b0f2625e5d315b8507c624"
test_modeling_common.py 26.8 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

thomwolf's avatar
thomwolf committed
16

17
import copy
Aymeric Augustin's avatar
Aymeric Augustin committed
18
import logging
19
import os.path
Aymeric Augustin's avatar
Aymeric Augustin committed
20
import random
21
import tempfile
thomwolf's avatar
thomwolf committed
22
23
import unittest

24
from transformers import is_torch_available
25

26
from .utils import require_torch, slow, torch_device
27

Aymeric Augustin's avatar
Aymeric Augustin committed
28

29
if is_torch_available():
thomwolf's avatar
thomwolf committed
30
    import torch
31
    import numpy as np
thomwolf's avatar
thomwolf committed
32

33
34
35
36
37
38
39
40
    from transformers import (
        AdaptiveEmbedding,
        PretrainedConfig,
        PreTrainedModel,
        BertModel,
        BertConfig,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
thomwolf's avatar
thomwolf committed
41

42

43
44
45
def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
46
        if "_range" in key or "_std" in key or "initializer_factor" in key:
47
48
49
            setattr(configs_no_init, key, 0.0)
    return configs_no_init

thomwolf's avatar
thomwolf committed
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@require_torch
class ModelTesterMixin:

    model_tester = None
    all_model_classes = ()
    test_torchscript = True
    test_pruning = True
    test_resize_embeddings = True
    test_head_masking = True
    is_encoder_decoder = False

    def test_save_load(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            out_2 = outputs[0].numpy()
            out_2[np.isnan(out_2)] = 0
73

74
            with tempfile.TemporaryDirectory() as tmpdirname:
75
76
                model.save_pretrained(tmpdirname)
                model = model_class.from_pretrained(tmpdirname)
77
                model.to(torch_device)
78
                with torch.no_grad():
79
                    after_outputs = model(**inputs_dict)
thomwolf's avatar
thomwolf committed
80

81
82
83
                # Make sure we don't have nans
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
thomwolf's avatar
thomwolf committed
84
85
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
86

87
88
89
90
91
92
93
94
95
96
97
98
99
    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if param.requires_grad:
                    self.assertIn(
                        param.data.mean().item(),
                        [0.0, 1.0],
                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
                    )
thomwolf's avatar
thomwolf committed
100

101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    def test_determinism(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                first = model(**inputs_dict)[0]
                second = model(**inputs_dict)[0]
            out_1 = first.cpu().numpy()
            out_2 = second.cpu().numpy()
            out_1 = out_1[~np.isnan(out_1)]
            out_2 = out_2[~np.isnan(out_2)]
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
sshleifer's avatar
sshleifer committed
120
        seq_len = getattr(self.model_tester, "seq_length", None)
sshleifer's avatar
sshleifer committed
121
122
        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
123
124
        decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

        for model_class in self.all_model_classes:
            config.output_attentions = True
            config.output_hidden_states = False
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, False)
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
141
            )
142
            out_len = len(outputs)
thomwolf's avatar
thomwolf committed
143

144
            if self.is_encoder_decoder:
Sam Shleifer's avatar
Sam Shleifer committed
145
146
147
148
149
150
151
152
153
154
155
                correct_outlen = (
                    4  # decoder_features_or_logits, decoder_attentions, encoder_features, encoder_attentions
                )
                decoder_attention_idx = 1
                if "lm_labels" in inputs_dict or "decoder_lm_labels" in inputs_dict:  # loss will come first
                    correct_outlen += 1  # compute loss
                    decoder_attention_idx += 1
                self.assertEqual(out_len, correct_outlen)

                decoder_attentions = outputs[decoder_attention_idx]
                self.assertIsInstance(decoder_attentions, (list, tuple))
156
                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
thomwolf's avatar
thomwolf committed
157
                self.assertListEqual(
158
159
                    list(decoder_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
160
                )
thomwolf's avatar
thomwolf committed
161

162
            # Check attention is always last and order is fine
thomwolf's avatar
thomwolf committed
163
164
            config.output_attentions = True
            config.output_hidden_states = True
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)

            self_attentions = outputs[-1]
            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
            )
thomwolf's avatar
thomwolf committed
180

181
182
    def test_torchscript(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
thomwolf's avatar
thomwolf committed
183

184
        self._create_and_check_torchscript(config, inputs_dict)
thomwolf's avatar
thomwolf committed
185

186
187
    def test_torchscript_output_attentions(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
thomwolf's avatar
thomwolf committed
188

189
190
        config.output_attentions = True
        self._create_and_check_torchscript(config, inputs_dict)
thomwolf's avatar
thomwolf committed
191

192
193
    def test_torchscript_output_hidden_state(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
194

195
196
        config.output_hidden_states = True
        self._create_and_check_torchscript(config, inputs_dict)
thomwolf's avatar
thomwolf committed
197

198
199
200
    def _create_and_check_torchscript(self, config, inputs_dict):
        if not self.test_torchscript:
            return
201

202
203
204
205
206
207
208
        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
        configs_no_init.torchscript = True
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            model.to(torch_device)
            model.eval()
            inputs = inputs_dict["input_ids"]  # Let's keep only input_ids
thomwolf's avatar
thomwolf committed
209

210
211
212
213
            try:
                traced_gpt2 = torch.jit.trace(model, inputs)
            except RuntimeError:
                self.fail("Couldn't trace module.")
thomwolf's avatar
thomwolf committed
214

215
            with tempfile.TemporaryDirectory() as tmp_dir_name:
216
                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
thomwolf's avatar
thomwolf committed
217

218
219
220
221
                try:
                    torch.jit.save(traced_gpt2, pt_file_name)
                except Exception:
                    self.fail("Couldn't save module.")
thomwolf's avatar
thomwolf committed
222

223
224
225
226
                try:
                    loaded_model = torch.jit.load(pt_file_name)
                except Exception:
                    self.fail("Couldn't load module.")
LysandreJik's avatar
LysandreJik committed
227

228
229
            model.to(torch_device)
            model.eval()
thomwolf's avatar
thomwolf committed
230

231
232
            loaded_model.to(torch_device)
            loaded_model.eval()
thomwolf's avatar
thomwolf committed
233

234
235
236
237
            model_state_dict = model.state_dict()
            loaded_model_state_dict = loaded_model.state_dict()

            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
thomwolf's avatar
thomwolf committed
238

239
            models_equal = True
240
241
            for layer_name, p1 in model_state_dict.items():
                p2 = loaded_model_state_dict[layer_name]
242
243
                if p1.data.ne(p2.data).sum() > 0:
                    models_equal = False
thomwolf's avatar
thomwolf committed
244

245
            self.assertTrue(models_equal)
thomwolf's avatar
thomwolf committed
246

247
248
249
    def test_headmasking(self):
        if not self.test_head_masking:
            return
250

251
252
253
        global_rng.seed(42)
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        global_rng.seed()
LysandreJik's avatar
LysandreJik committed
254

255
256
257
258
259
260
261
        config.output_attentions = True
        config.output_hidden_states = True
        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            model.to(torch_device)
            model.eval()
LysandreJik's avatar
LysandreJik committed
262

263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
            # Prepare head_mask
            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
            head_mask = torch.ones(
                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
            )
            head_mask[0, 0] = 0
            head_mask[-1, :-1] = 0
            head_mask.requires_grad_(requires_grad=True)
            inputs = inputs_dict.copy()
            inputs["head_mask"] = head_mask

            outputs = model(**inputs)

            # Test that we can get a gradient back for importance score computation
            output = sum(t.sum() for t in outputs[0])
            output = output.sum()
            output.backward()
            multihead_outputs = head_mask.grad

            attentions = outputs[-1]

            # Remove Nan
            for t in attentions:
                self.assertLess(
                    torch.sum(torch.isnan(t)), t.numel() / 4
                )  # Check we don't have more than 25% nans (arbitrary)
            attentions = [
                t.masked_fill(torch.isnan(t), 0.0) for t in attentions
            ]  # remove them (the test is less complete)

            self.assertIsNotNone(multihead_outputs)
            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
            self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
            self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)

    def test_head_pruning(self):
        if not self.test_pruning:
            return

        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
307

308
309
            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
310

311
312
313
314
315
316
317
318
319
            config.output_attentions = True
            config.output_hidden_states = False
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
            model.prune_heads(heads_to_prune)
            with torch.no_grad():
                outputs = model(**inputs_dict)
320

321
            attentions = outputs[-1]
322

323
324
325
            self.assertEqual(attentions[0].shape[-3], 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
LysandreJik's avatar
LysandreJik committed
326

327
328
329
    def test_head_pruning_save_load_from_pretrained(self):
        if not self.test_pruning:
            return
LysandreJik's avatar
LysandreJik committed
330

331
332
333
334
335
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
336

337
338
339
340
341
342
343
            config.output_attentions = True
            config.output_hidden_states = False
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
            model.prune_heads(heads_to_prune)
344

345
            with tempfile.TemporaryDirectory() as temp_dir_name:
346
347
                model.save_pretrained(temp_dir_name)
                model = model_class.from_pretrained(temp_dir_name)
348
                model.to(torch_device)
349

350
351
352
353
354
355
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
            self.assertEqual(attentions[0].shape[-3], 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
356

357
358
359
    def test_head_pruning_save_load_from_config_init(self):
        if not self.test_pruning:
            return
360

361
362
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
363

364
365
            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
366

367
368
            config.output_attentions = True
            config.output_hidden_states = False
369

370
371
            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
            config.pruned_heads = heads_to_prune
372

373
374
375
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
376

377
378
379
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
380

381
382
383
            self.assertEqual(attentions[0].shape[-3], 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
384

385
386
387
    def test_head_pruning_integration(self):
        if not self.test_pruning:
            return
388

389
390
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
391

392
393
            if "head_mask" in inputs_dict:
                del inputs_dict["head_mask"]
394

395
396
            config.output_attentions = True
            config.output_hidden_states = False
397

398
399
            heads_to_prune = {0: [0], 1: [1, 2]}
            config.pruned_heads = heads_to_prune
400

401
402
403
            model = model_class(config=config)
            model.to(torch_device)
            model.eval()
404

405
406
407
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
408

409
410
411
412
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
thomwolf's avatar
thomwolf committed
413

414
            with tempfile.TemporaryDirectory() as temp_dir_name:
415
416
                model.save_pretrained(temp_dir_name)
                model = model_class.from_pretrained(temp_dir_name)
417
                model.to(torch_device)
thomwolf's avatar
thomwolf committed
418

419
420
421
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
LysandreJik's avatar
LysandreJik committed
422

423
424
425
426
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
thomwolf's avatar
thomwolf committed
427

428
429
            heads_to_prune = {0: [0], 2: [1, 2]}
            model.prune_heads(heads_to_prune)
430

431
432
433
            with torch.no_grad():
                outputs = model(**inputs_dict)
            attentions = outputs[-1]
434

435
436
437
438
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
439

440
            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
thomwolf's avatar
thomwolf committed
441

442
443
    def test_hidden_states_output(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
thomwolf's avatar
thomwolf committed
444

445
446
447
448
        for model_class in self.all_model_classes:
            config.output_hidden_states = True
            config.output_attentions = False
            model = model_class(config)
449
            model.to(torch_device)
thomwolf's avatar
thomwolf committed
450
            model.eval()
thomwolf's avatar
thomwolf committed
451
            with torch.no_grad():
452
453
454
455
456
457
458
459
460
461
462
463
464
                outputs = model(**inputs_dict)
            hidden_states = outputs[-1]
            self.assertEqual(model.config.output_attentions, False)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [
                    self.model_tester.encoder_seq_length
                    if hasattr(self.model_tester, "encoder_seq_length")
                    else self.model_tester.seq_length,
                    self.model_tester.hidden_size,
                ],
465
            )
thomwolf's avatar
thomwolf committed
466

467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
    def test_resize_tokens_embeddings(self):
        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.test_resize_embeddings:
            return

        for model_class in self.all_model_classes:
            config = copy.deepcopy(original_config)
            model = model_class(config)

            model_vocab_size = config.vocab_size
            # Retrieve the embeddings and clone theme
            model_embed = model.resize_token_embeddings(model_vocab_size)
            cloned_embeddings = model_embed.weight.clone()

            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
            # Check that it actually resizes the embeddings matrix
            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
486
487
            # Check that the model can still do a forward pass successfully (every parameter should be resized)
            model(**inputs_dict)
488
489
490
491
492
493
494

            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
            # Check that it actually resizes the embeddings matrix
            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

495
496
497
498
499
            # Check that the model can still do a forward pass successfully (every parameter should be resized)
            # Input ids should be clamped to the maximum size of the vocabulary
            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
            model(**inputs_dict)

500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
            models_equal = True
            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
                if p1.data.ne(p2.data).sum() > 0:
                    models_equal = False

            self.assertTrue(models_equal)

    def test_model_common_attributes(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
            model.set_input_embeddings(torch.nn.Embedding(10, 10))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))

    def test_tie_model_weights(self):
        if not self.test_torchscript:
            return

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def check_same_values(layer_1, layer_2):
            equal = True
            for p1, p2 in zip(layer_1.weight, layer_2.weight):
                if p1.data.ne(p2.data).sum() > 0:
                    equal = False
            return equal

        for model_class in self.all_model_classes:
            config.torchscript = True
            model_not_tied = model_class(config)
            if model_not_tied.get_output_embeddings() is None:
                continue

            params_not_tied = list(model_not_tied.parameters())

            config_tied = copy.deepcopy(config)
            config_tied.torchscript = False
            model_tied = model_class(config_tied)
            params_tied = list(model_tied.parameters())

            # Check that the embedding layer and decoding layer are the same in size and in value
            self.assertGreater(len(params_not_tied), len(params_tied))
            # self.assertTrue(check_same_values(embeddings, decoding))

            # # Check that after modification, they remain the same.
            # embeddings.weight.data.div_(2)
            # # Check that the embedding layer and decoding layer are the same in size and in value
            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
            # self.assertTrue(check_same_values(embeddings, decoding))

            # # Check that after modification, they remain the same.
            # decoding.weight.data.div_(4)
            # # Check that the embedding layer and decoding layer are the same in size and in value
            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
            # self.assertTrue(check_same_values(embeddings, decoding))

            # Check that after resize they remain tied.
            model_tied.resize_token_embeddings(config.vocab_size + 10)
            params_tied_2 = list(model_tied.parameters())
            self.assertGreater(len(params_not_tied), len(params_tied))
            self.assertEqual(len(params_tied_2), len(params_tied))

            # decoding.weight.data.mul_(20)
            # # Check that the embedding layer and decoding layer are the same in size and in value
            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))

    def test_inputs_embeds(self):
Sam Shleifer's avatar
Sam Shleifer committed
572

573
574
575
576
577
578
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.is_encoder_decoder:
            input_ids = inputs_dict["input_ids"]
            del inputs_dict["input_ids"]
        else:
            encoder_input_ids = inputs_dict["encoder_input_ids"]
Sam Shleifer's avatar
Sam Shleifer committed
579
            decoder_input_ids = inputs_dict.get("decoder_input_ids", encoder_input_ids)
580
            del inputs_dict["encoder_input_ids"]
Sam Shleifer's avatar
Sam Shleifer committed
581
            inputs_dict.pop("decoder_input_ids", None)
582
583
584

        for model_class in self.all_model_classes:
            model = model_class(config)
585
            model.to(torch_device)
thomwolf's avatar
thomwolf committed
586
            model.eval()
587
588
589
590
591
592
593
594

            wte = model.get_input_embeddings()
            if not self.is_encoder_decoder:
                inputs_dict["inputs_embeds"] = wte(input_ids)
            else:
                inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
                inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)

thomwolf's avatar
thomwolf committed
595
            with torch.no_grad():
596
                model(**inputs_dict)
597
598


599
global_rng = random.Random()
thomwolf's avatar
thomwolf committed
600
601


thomwolf's avatar
thomwolf committed
602
603
604
def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Creates a random int32 tensor of the shape within the vocab size."""
    if rng is None:
605
        rng = global_rng
thomwolf's avatar
thomwolf committed
606

thomwolf's avatar
thomwolf committed
607
608
609
    total_dims = 1
    for dim in shape:
        total_dims *= dim
thomwolf's avatar
thomwolf committed
610

thomwolf's avatar
thomwolf committed
611
612
613
    values = []
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))
thomwolf's avatar
thomwolf committed
614

615
    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
thomwolf's avatar
thomwolf committed
616
617


618
619
620
621
622
623
624
625
626
627
628
629
630
def floats_tensor(shape, scale=1.0, rng=None, name=None):
    """Creates a random float32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = global_rng

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.random() * scale)

631
    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
632
633


634
@require_torch
thomwolf's avatar
thomwolf committed
635
class ModelUtilsTest(unittest.TestCase):
636
    @slow
thomwolf's avatar
thomwolf committed
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)

            model = BertModel.from_pretrained(model_name)
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)