"backend/vscode:/vscode.git/clone" did not exist on "19700e90748fe7ccbe410a8adbd3ba74cba429d7"
test_tokenization_auto.py 5.35 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
    BertTokenizer,
    BertTokenizerFast,
    GPT2Tokenizer,
    GPT2TokenizerFast,
    RobertaTokenizer,
    RobertaTokenizerFast,
)
from transformers.configuration_auto import AutoConfig
from transformers.configuration_roberta import RobertaConfig
from transformers.testing_utils import (
    DUMMY_DIFF_TOKENIZER_IDENTIFIER,
    DUMMY_UNKWOWN_IDENTIFIER,
    SMALL_MODEL_IDENTIFIER,
    require_tokenizers,
)
from transformers.tokenization_auto import TOKENIZER_MAPPING


class AutoTokenizerTest(unittest.TestCase):
    """Integration tests for ``AutoTokenizer.from_pretrained`` dispatch.

    Checks that the auto class resolves checkpoint identifiers to the correct
    concrete tokenizer classes (slow/Python vs. fast/Rust variants), honors an
    explicitly supplied ``config``, raises on unknown identifiers, and that the
    auto-mapping is ordered child-before-parent.
    """

    # @slow
    def test_tokenizer_from_pretrained(self):
        """Official BERT and GPT-2 checkpoints resolve to the right tokenizer family."""
        # Skip the Japanese BERT checkpoints: their tokenizer needs extra
        # dependencies not exercised by this test.
        for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
            # A loaded tokenizer must have a non-empty vocabulary.
            self.assertGreater(len(tokenizer), 0)

        for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast))
            self.assertGreater(len(tokenizer), 0)

    def test_tokenizer_from_pretrained_identifier(self):
        """A hub identifier for a tiny BERT model loads a BERT tokenizer."""
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        # The dummy checkpoint ships a 12-token vocabulary.
        self.assertEqual(tokenizer.vocab_size, 12)

    def test_tokenizer_from_model_type(self):
        """Dispatch falls back to the config's model_type for unknown identifiers."""
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
        self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 20)

    def test_tokenizer_from_tokenizer_class(self):
        """An explicit ``config`` argument overrides identifier-based dispatch."""
        config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
        self.assertIsInstance(config, RobertaConfig)
        # Check that tokenizer_type ≠ model_type
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)

    @require_tokenizers
    def test_tokenizer_identifier_with_correct_config(self):
        """A cased checkpoint loads with do_lower_case=False in both slow and fast classes."""
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))

            # Slow and fast tokenizers expose the flag at different paths.
            if isinstance(tokenizer, BertTokenizer):
                self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
            else:
                self.assertEqual(tokenizer.do_lower_case, False)

            self.assertEqual(tokenizer.max_len, 512)

    @require_tokenizers
    def test_tokenizer_identifier_non_existent(self):
        """Loading a non-existent identifier raises EnvironmentError for every class."""
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            with self.assertRaises(EnvironmentError):
                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")

    def test_parents_and_children_in_mappings(self):
        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
        # by the parents and will return the wrong configuration type when using auto models

        mappings = (TOKENIZER_MAPPING,)

        for mapping in mappings:
            mapping = tuple(mapping.items())
            # Every entry must not be a subclass of any entry that precedes it.
            for index, (child_config, (child_model_py, child_model_fast)) in enumerate(mapping[1:]):
                for parent_config, (parent_model_py, parent_model_fast) in mapping[: index + 1]:
                    with self.subTest(
                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
                    ):
                        self.assertFalse(issubclass(child_config, parent_config))

                        # Check for Slow tokenizer implementation if provided
                        if child_model_py and parent_model_py:
                            self.assertFalse(issubclass(child_model_py, parent_model_py))

                        # Check for Fast tokenizer implementation if provided
                        if child_model_fast and parent_model_fast:
                            self.assertFalse(issubclass(child_model_fast, parent_model_fast))

    @require_tokenizers
    def test_from_pretrained_use_fast_toggle(self):
        """The ``use_fast`` flag selects between the slow and fast implementations."""
        self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer)
        self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast)