Commit 1e82cd84 authored by Lysandre

Flaubert auto tokenizer + tests

cc @julien-c
parent d18d47be
@@ -50,8 +50,8 @@ class FlaubertConfig(XLMConfig):
             Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
             with Structured Dropout. ICLR 2020)
         vocab_size (:obj:`int`, optional, defaults to 30145):
-            Vocabulary size of the XLM model. Defines the different tokens that
-            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
+            Vocabulary size of the Flaubert model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
         emb_dim (:obj:`int`, optional, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
         n_layer (:obj:`int`, optional, defaults to 12):
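For reference, a minimal sketch of what these documented defaults look like on the config object. This is illustrative only and not part of the diff; FlaubertConfig subclasses XLMConfig, so the XLM-style arguments are accepted:

from transformers import FlaubertConfig

# Instantiate with the defaults documented above; any value can be overridden.
config = FlaubertConfig(vocab_size=30145, emb_dim=2048)
print(config.vocab_size, config.emb_dim)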
......
@@ -25,6 +25,7 @@ from .configuration_auto import (
     CamembertConfig,
     CTRLConfig,
     DistilBertConfig,
+    FlaubertConfig,
     GPT2Config,
     OpenAIGPTConfig,
     RobertaConfig,
@@ -41,6 +42,7 @@ from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_ctrl import CTRLTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_roberta import RobertaTokenizer
@@ -67,6 +69,7 @@ TOKENIZER_MAPPING = OrderedDict(
         (GPT2Config, GPT2Tokenizer),
         (TransfoXLConfig, TransfoXLTokenizer),
         (XLNetConfig, XLNetTokenizer),
+        (FlaubertConfig, FlaubertTokenizer),
         (XLMConfig, XLMTokenizer),
         (CTRLConfig, CTRLTokenizer),
     ]
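Note on ordering: FlaubertConfig subclasses XLMConfig, and the auto classes pick a tokenizer by walking this ordered mapping with isinstance-style checks, so the Flaubert entry has to sit before the XLM one. A rough, self-contained sketch of that lookup, using stand-in classes rather than the real library code:

from collections import OrderedDict

# Stand-in classes that mirror the real inheritance relationship.
class XLMConfig: ...
class FlaubertConfig(XLMConfig): ...
class XLMTokenizer: ...
class FlaubertTokenizer: ...

# Child config listed before its parent, as in TOKENIZER_MAPPING above.
MAPPING = OrderedDict([(FlaubertConfig, FlaubertTokenizer), (XLMConfig, XLMTokenizer)])

def resolve(config):
    # The first isinstance match wins; if XLMConfig came first, a Flaubert
    # config would be routed to the XLM tokenizer instead.
    for config_class, tokenizer_class in MAPPING.items():
        if isinstance(config, config_class):
            return tokenizer_class
    raise ValueError("Unrecognized configuration class")

assert resolve(FlaubertConfig()) is FlaubertTokenizer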
......
@@ -39,6 +39,14 @@ if is_torch_available():
         BertForQuestionAnswering,
     )
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers.modeling_auto import (
+        MODEL_MAPPING,
+        MODEL_FOR_PRETRAINING_MAPPING,
+        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_WITH_LM_HEAD_MAPPING,
+    )


 @require_torch
@@ -127,3 +135,26 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsInstance(model, RobertaForMaskedLM)
             self.assertEqual(model.num_parameters(), 14830)
             self.assertEqual(model.num_parameters(only_trainable=True), 14830)
+
+    def test_parents_and_children_in_mappings(self):
+        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
+        # by the parents and will return the wrong configuration type when using auto models
+
+        mappings = (
+            MODEL_MAPPING,
+            MODEL_FOR_PRETRAINING_MAPPING,
+            MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+            MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            MODEL_WITH_LM_HEAD_MAPPING,
+        )
+
+        for mapping in mappings:
+            mapping = tuple(mapping.items())
+            for index, (child_config, child_model) in enumerate(mapping[1:]):
+                for parent_config, parent_model in mapping[: index + 1]:
+                    with self.subTest(
+                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
+                    ):
+                        self.assertFalse(issubclass(child_config, parent_config))
+                        self.assertFalse(issubclass(child_model, parent_model))
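To see what this test guards against, here is the same parent/child check run on a deliberately mis-ordered toy mapping; the classes below are stand-ins, not the real transformers ones:

from collections import OrderedDict

class ParentConfig: ...
class ChildConfig(ParentConfig): ...
class ParentModel: ...
class ChildModel(ParentModel): ...

# Parent listed first: an isinstance-based lookup would never reach the child entry.
bad_mapping = tuple(OrderedDict([(ParentConfig, ParentModel), (ChildConfig, ChildModel)]).items())

for index, (child_config, child_model) in enumerate(bad_mapping[1:]):
    for parent_config, parent_model in bad_mapping[: index + 1]:
        if issubclass(child_config, parent_config):
            # This branch fires for the toy mapping, which is exactly what the test flags.
            print("{} is listed after its parent {}".format(child_config.__name__, parent_config.__name__))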
@@ -25,6 +25,7 @@ from transformers import (
     GPT2Tokenizer,
     RobertaTokenizer,
 )
+from transformers.tokenization_auto import TOKENIZER_MAPPING

 from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow  # noqa: F401
@@ -70,3 +71,19 @@ class AutoTokenizerTest(unittest.TestCase):
         for tokenizer_class in [BertTokenizer, AutoTokenizer]:
             with self.assertRaises(EnvironmentError):
                 _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
+
+    def test_parents_and_children_in_mappings(self):
+        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
+        # by the parents and will return the wrong configuration type when using auto models
+
+        mappings = (TOKENIZER_MAPPING,)
+
+        for mapping in mappings:
+            mapping = tuple(mapping.items())
+            for index, (child_config, child_model) in enumerate(mapping[1:]):
+                for parent_config, parent_model in mapping[: index + 1]:
+                    with self.subTest(
+                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
+                    ):
+                        self.assertFalse(issubclass(child_config, parent_config))
+                        self.assertFalse(issubclass(child_model, parent_model))
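With the mapping entry and tests in place, a Flaubert checkpoint should resolve to FlaubertTokenizer through the auto class. A hedged usage sketch; the checkpoint identifier below is an assumed example, not something this commit introduces:

from transformers import AutoTokenizer, FlaubertTokenizer

# "flaubert-base-cased" is a placeholder identifier; substitute a Flaubert
# checkpoint that actually exists on the model hub.
tokenizer = AutoTokenizer.from_pretrained("flaubert-base-cased")
assert isinstance(tokenizer, FlaubertTokenizer)
print(tokenizer.tokenize("Bonjour, le monde !"))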