# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
import unittest

from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
    BertTokenizer,
    GPT2Tokenizer,
    RobertaTokenizer,
)
from transformers.tokenization_auto import TOKENIZER_MAPPING

from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow  # noqa: F401


class AutoTokenizerTest(unittest.TestCase):
    # @slow
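    # note: the @slow marker is left commented out here even though the test below
    # fetches the vocabulary files for every known BERT and GPT-2 checkpoint.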
    def test_tokenizer_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
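        # Every official BERT checkpoint name should resolve to a BertTokenizer.
        # The Japanese checkpoints are skipped because they are handled by a
        # separate tokenizer class rather than BertTokenizer.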
        for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, BertTokenizer)
            self.assertGreater(len(tokenizer), 0)

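        # Same check for the GPT-2 checkpoints, which should resolve to GPT2Tokenizer.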
        for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
            self.assertGreater(len(tokenizer), 0)

    def test_tokenizer_from_pretrained_identifier(self):
        logging.basicConfig(level=logging.INFO)
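        # SMALL_MODEL_IDENTIFIER points to a tiny BERT-style dummy checkpoint on the
        # model hub, so AutoTokenizer is expected to return a BertTokenizer.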
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, BertTokenizer)
        self.assertEqual(len(tokenizer), 12)

    def test_tokenizer_from_model_type(self):
        logging.basicConfig(level=logging.INFO)
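        # DUMMY_UNKWOWN_IDENTIFIER points to a checkpoint whose repo name does not
        # reveal the architecture, so the tokenizer class has to be resolved from the
        # model_type declared in the checkpoint's config (a RoBERTa model here).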
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
        self.assertIsInstance(tokenizer, RobertaTokenizer)
        self.assertEqual(len(tokenizer), 20)

    def test_tokenizer_identifier_with_correct_config(self):
        logging.basicConfig(level=logging.INFO)
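        # A community checkpoint: both the concrete BertTokenizer and AutoTokenizer
        # should pick up the tokenizer settings shipped with the model
        # (cased vocabulary, 512-token maximum length).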
        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
            tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
            self.assertIsInstance(tokenizer, BertTokenizer)
            self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
            self.assertEqual(tokenizer.max_len, 512)

    def test_tokenizer_identifier_non_existent(self):
        logging.basicConfig(level=logging.INFO)
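        # Loading a model identifier that does not exist on the hub should raise an
        # EnvironmentError instead of silently returning a default tokenizer.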
        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
            with self.assertRaises(EnvironmentError):
                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")

    def test_parents_and_children_in_mappings(self):
        # Test that child classes are placed before their parent classes in the mappings:
        # the `isinstance` check used by the auto classes would otherwise match the parent
        # first and return the wrong tokenizer class.

        mappings = (TOKENIZER_MAPPING,)

        for mapping in mappings:
            mapping = tuple(mapping.items())
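            # mapping[1:] pairs each entry with every entry registered before it: no
            # later (child) class may be a subclass of an earlier (parent) class,
            # otherwise the isinstance-based lookup would match the parent first.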
            for index, (child_config, child_model) in enumerate(mapping[1:]):
                for parent_config, parent_model in mapping[: index + 1]:
                    with self.subTest(
                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
                    ):
                        self.assertFalse(issubclass(child_config, parent_config))
                        self.assertFalse(issubclass(child_model, parent_model))