# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import copy
import os
import random
import shutil
import sys
import tempfile
import unittest

from transformers import is_tf_available, is_torch_available

from .utils import require_tf


if is_tf_available():
    import tensorflow as tf
    import numpy as np

    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP

if sys.version_info[0] == 2:

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""

        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name

        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)


else:
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str


def _config_zero_init(config):
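    """Return a deep copy of `config` with every initializer-related attribute (keys containing "_range" or "_std") set to 0.0."""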
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
        if "_range" in key or "_std" in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init


class TFCommonTestCases:
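    """Wrapper namespace so the common tester below is not discovered and run directly by unittest; model-specific test files subclass `TFCommonModelTester`."""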
    @require_tf
    class TFCommonModelTester(unittest.TestCase):
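        """Common checks shared by all TF model test classes.

        Concrete test files subclass this inner class and set `model_tester` (an object
        providing `prepare_config_and_inputs_for_common()`) and `all_model_classes`,
        e.g. (sketch):

            class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                all_model_classes = (TFBertModel,) if is_tf_available() else ()
        """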

        model_tester = None
        all_model_classes = ()
        test_torchscript = True
        test_pruning = True
        test_resize_embeddings = True
        is_encoder_decoder = False

        def test_initialization(self):
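            """Weight-initialization checks are currently skipped for the TF tests (the PyTorch-style check is kept below for reference)."""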
            pass
            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            # configs_no_init = _config_zero_init(config)
            # for model_class in self.all_model_classes:
            #     model = model_class(config=configs_no_init)
            #     for name, param in model.named_parameters():
            #         if param.requires_grad:
            #             self.assertIn(param.data.mean().item(), [0.0, 1.0],
            #             msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))

        def test_save_load(self):
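            """Round-trip every model class through save_pretrained/from_pretrained and check the reloaded model gives the same first output."""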
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                outputs = model(inputs_dict)

                with TemporaryDirectory() as tmpdirname:
                    model.save_pretrained(tmpdirname)
                    model = model_class.from_pretrained(tmpdirname)
                    after_outputs = model(inputs_dict)

                    # Make sure we don't have nans
                    out_1 = after_outputs[0].numpy()
                    out_2 = outputs[0].numpy()
                    out_1 = out_1[~np.isnan(out_1)]
                    out_2 = out_2[~np.isnan(out_2)]
                    max_diff = np.amax(np.abs(out_1 - out_2))
                    self.assertLessEqual(max_diff, 1e-5)

        def test_pt_tf_model_equivalence(self):
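            """Cross-load weights between each TF model and its PyTorch counterpart (in memory and via checkpoints) and check the first outputs stay within tolerance."""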
            if not is_torch_available():
                return

            import torch
            import transformers

            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
                pt_model_class = getattr(transformers, pt_model_class_name)

                config.output_hidden_states = True
                tf_model = model_class(config)
                pt_model = pt_model_class(config)

                # Check we can load pt model in tf and vice-versa with model => model functions
                tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
                pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

                # Check predictions on first output (logits/hidden-states) are close enough, given low-level computational differences
                pt_model.eval()
                pt_inputs_dict = dict(
                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
                )
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
                tfo = tf_model(inputs_dict, training=False)
                tf_hidden_states = tfo[0].numpy()
                pt_hidden_states = pto[0].numpy()
                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                self.assertLessEqual(max_diff, 2e-2)

                # Check we can load pt model in tf and vice-versa with checkpoint => model functions
                with TemporaryDirectory() as tmpdirname:
                    pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
                    torch.save(pt_model.state_dict(), pt_checkpoint_path)
                    tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)

                    tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
                    tf_model.save_weights(tf_checkpoint_path)
                    pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)

                # Check predictions on first output (logits/hidden-states) are close enough, given low-level computational differences
                pt_model.eval()
                pt_inputs_dict = dict(
                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
                )
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
                tfo = tf_model(inputs_dict)
                tfo = tfo[0].numpy()
                pto = pto[0].numpy()
                tfo[np.isnan(tfo)] = 0
                pto[np.isnan(pto)] = 0
                max_diff = np.amax(np.abs(tfo - pto))
                self.assertLessEqual(max_diff, 2e-2)

        def test_compile_tf_model(self):
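            """Check each model can be wrapped in a larger Keras functional model (with a Dense head on top) and compiled."""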
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            if self.is_encoder_decoder:
                input_ids = {
                    "decoder_input_ids": tf.keras.Input(
                        batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"
                    ),
                    "encoder_input_ids": tf.keras.Input(
                        batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"
                    ),
                }
            else:
                input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

            for model_class in self.all_model_classes:
                # Prepare our model
                model = model_class(config)

                # Let's load it from the disk to be sure we can use pretrained weights
                with TemporaryDirectory() as tmpdirname:
                    outputs = model(inputs_dict)  # build the model
                    model.save_pretrained(tmpdirname)
                    model = model_class.from_pretrained(tmpdirname)

                outputs_dict = model(input_ids)
                hidden_states = outputs_dict[0]

                # Add a dense layer on top to test integration with other Keras modules
                outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)

                # Compile extended model
                extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
                extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

        def test_keyword_and_dict_args(self):
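            """Check that calling a model with a single input dict and with keyword arguments produces the same first output."""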
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                outputs_dict = model(inputs_dict)

                inputs_keywords = copy.deepcopy(inputs_dict)
                input_ids = inputs_keywords.pop(
                    "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None
                )
                outputs_keywords = model(input_ids, **inputs_keywords)

                output_dict = outputs_dict[0].numpy()
                output_keywords = outputs_keywords[0].numpy()

                self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)

        def test_attention_outputs(self):
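            """With output_attentions enabled, check the attention tensors are returned last and have the expected number of layers and shape."""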
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            decoder_seq_length = (
                self.model_tester.decoder_seq_length
                if hasattr(self.model_tester, "decoder_seq_length")
                else self.model_tester.seq_length
            )
            encoder_seq_length = (
                self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "encoder_seq_length")
                else self.model_tester.seq_length
            )
            decoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
            )
            encoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
            )

            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
                outputs = model(inputs_dict)
                attentions = [t.numpy() for t in outputs[-1]]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )
                out_len = len(outputs)

                if self.is_encoder_decoder:
                    self.assertEqual(out_len % 2, 0)
                    decoder_attentions = outputs[(out_len // 2) - 1]
                    self.assertEqual(model.config.output_attentions, True)
                    self.assertEqual(model.config.output_hidden_states, False)
                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                    self.assertListEqual(
                        list(decoder_attentions[0].shape[-3:]),
                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                    )

                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                outputs = model(inputs_dict)
                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)

                attentions = [t.numpy() for t in outputs[-1]]
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

        def test_hidden_states_output(self):
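            """With output_hidden_states enabled, check the hidden states have num_hidden_layers + 1 entries of the expected shape."""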
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                config.output_hidden_states = True
                config.output_attentions = False
                model = model_class(config)
                outputs = model(inputs_dict)
                hidden_states = [t.numpy() for t in outputs[-1]]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
                )

        def test_model_common_attributes(self):
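            """Check get_input_embeddings() returns a Keras layer and get_output_embeddings() returns a Keras layer or None."""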
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
                x = model.get_output_embeddings()
                assert x is None or isinstance(x, tf.keras.layers.Layer)

        def test_determinism(self):
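            """Two forward passes with training=False on the same inputs should give (numerically) identical outputs."""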
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
                out_1 = first.numpy()
                out_2 = second.numpy()
                out_1 = out_1[~np.isnan(out_1)]
                out_2 = out_2[~np.isnan(out_2)]
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)

        def _get_embeds(self, wte, input_ids):
            # In our TF models, the input embeddings layer can take slightly different call
            # signatures, so we try a few of them and, as a last resort, fall back to a
            # synthetic dummy tensor of ones with the right embedding dimension.
            try:
                x = wte(input_ids, mode="embedding")
            except Exception:
                try:
                    x = wte([input_ids], mode="embedding")
                except Exception:
                    try:
                        x = wte([input_ids, None, None, None], mode="embedding")
                    except Exception:
                        if hasattr(self.model_tester, "embedding_size"):
                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
                        else:
                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
            return x

        def test_inputs_embeds(self):
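            """Check models accept pre-computed embeddings (`inputs_embeds` or the encoder/decoder variants) in place of input ids."""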
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            if not self.is_encoder_decoder:
                input_ids = inputs_dict["input_ids"]
                del inputs_dict["input_ids"]
            else:
                encoder_input_ids = inputs_dict["encoder_input_ids"]
                decoder_input_ids = inputs_dict["decoder_input_ids"]
                del inputs_dict["encoder_input_ids"]
                del inputs_dict["decoder_input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)

                wte = model.get_input_embeddings()
                if not self.is_encoder_decoder:
                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
                else:
                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)

                outputs = model(inputs_dict)


def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    """Creates a random int32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = random.Random()

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))

    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)

    return output