Commit f0a4fc6c authored by Hang Le, committed by Lysandre Debut

Add Flaubert

parent a5381495
...@@ -160,8 +160,9 @@ At some point in the future, you'll be able to seamlessly move from pre-training
12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
16. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
17. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.

These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performance in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
......
...@@ -41,6 +41,9 @@ from transformers import (
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
FlaubertConfig,
FlaubertForSequenceClassification,
FlaubertTokenizer,
RobertaConfig,
RobertaForSequenceClassification,
RobertaTokenizer,
...@@ -80,6 +83,7 @@ ALL_MODELS = sum(
DistilBertConfig,
AlbertConfig,
XLMRobertaConfig,
FlaubertConfig,
)
),
(),
...@@ -93,6 +97,7 @@ MODEL_CLASSES = {
"distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
"albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
"xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
"flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
}
...@@ -480,7 +485,7 @@ def main():
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
)
......
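With the `"flaubert"` entry registered in `MODEL_CLASSES` above, `run_glue.py` resolves the FlauBERT classes from `--model_type flaubert`. A minimal sketch of that lookup (illustrative only; `flaubert-base-cased` is one of the checkpoints published with this change):

```python
config_class, model_class, tokenizer_class = MODEL_CLASSES["flaubert"]

# e.g. fine-tuning on a two-label GLUE task
config = config_class.from_pretrained("flaubert-base-cased", num_labels=2)
tokenizer = tokenizer_class.from_pretrained("flaubert-base-cased")
model = model_class.from_pretrained("flaubert-base-cased", config=config)
```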
...@@ -25,6 +25,7 @@ from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
from .configuration_mmbt import MMBTConfig
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
...@@ -108,6 +109,7 @@ from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_ctrl import CTRLTokenizer
from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_flaubert import FlaubertTokenizer
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_roberta import RobertaTokenizer
...@@ -260,6 +262,15 @@ if is_torch_available():
)
from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
from .modeling_flaubert import (
FlaubertModel,
FlaubertWithLMHeadModel,
FlaubertForSequenceClassification,
FlaubertForQuestionAnswering,
FlaubertForQuestionAnsweringSimple,
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
)

# Optimization
from .optimization import (
AdamW,
......
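Because the new classes are exported from the package root, FlauBERT can be used like any other model in the library once PyTorch is available. A hedged sketch, not taken from the patch:

```python
import torch
from transformers import FlaubertModel, FlaubertTokenizer

tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")
model = FlaubertModel.from_pretrained("flaubert-base-cased")

input_ids = torch.tensor([tokenizer.encode("Le chat mange une pomme.")])
last_hidden_state = model(input_ids)[0]  # (batch_size, sequence_length, hidden_size)
```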
...@@ -23,6 +23,7 @@ from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
...@@ -53,6 +54,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
]
for key, value, in pretrained_map.items()
)
...@@ -73,6 +75,7 @@ CONFIG_MAPPING = OrderedDict(
("xlnet", XLNetConfig,),
("xlm", XLMConfig,),
("ctrl", CTRLConfig,),
("flaubert", FlaubertConfig,),
]
)
...@@ -126,6 +129,7 @@ class AutoConfig(object):
- contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
- contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
- contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
- contains `flaubert` : :class:`~transformers.FlaubertConfig` (Flaubert model)
Args:
......
# coding=utf-8
# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Flaubert configuration, based on XLM. """
import logging
from .configuration_xlm import XLMConfig
logger = logging.getLogger(__name__)
FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/config.json",
"flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/config.json",
"flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/config.json",
"flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/config.json",
}
class FlaubertConfig(XLMConfig):
    """Configuration class to store the configuration of a `FlaubertModel`.

    FlauBERT uses the same architecture as XLM, so this class accepts every argument of
    `XLMConfig` (vocabulary size, embedding dimension, number of layers and attention
    heads, dropout probabilities, maximum position embeddings, initialization and
    language-embedding options, etc.) and adds two FlauBERT-specific arguments:

    Args:
        layerdrop: Probability of dropping an entire Transformer layer during training
            (LayerDrop); 0.0 disables it.
        pre_norm: Whether to apply layer normalization before (True) rather than after
            (False) each attention and feed-forward block.
    """

    pretrained_config_archive_map = FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "flaubert"

    def __init__(self, layerdrop=0.0, pre_norm=False, **kwargs):
        """Constructs FlaubertConfig."""
        super().__init__(**kwargs)
        self.layerdrop = layerdrop
        self.pre_norm = pre_norm
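For illustration only (not part of the patch), the configuration behaves like an `XLMConfig` with the two extra fields:

```python
from transformers import FlaubertConfig

# Illustrative values; any XLMConfig argument can be combined with the new options.
config = FlaubertConfig(emb_dim=512, n_layers=6, layerdrop=0.1, pre_norm=True)
print(config.model_type)                  # "flaubert"
print(config.layerdrop, config.pre_norm)  # 0.1 True
```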
This diff is collapsed.
# coding=utf-8
# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Flaubert, based on XLM."""
import logging
import unicodedata
import six
from .tokenization_xlm import XLMTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json",
"flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json",
"flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json",
"flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json",
},
"merges_file": {
"flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt",
"flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt",
"flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt",
"flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"flaubert-small-cased": 512,
"flaubert-base-uncased": 512,
"flaubert-base-cased": 512,
"flaubert-large-cased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"flaubert-small-cased": {"do_lowercase": False},
"flaubert-base-uncased": {"do_lowercase": True},
"flaubert-base-cased": {"do_lowercase": False},
"flaubert-large-cased": {"do_lowercase": False},
}
def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
    """
    # six_ensure_text is copied from https://github.com/benjaminp/six
    def six_ensure_text(s, encoding="utf-8", errors="strict"):
        if isinstance(s, six.binary_type):
            return s.decode(encoding, errors)
        elif isinstance(s, six.text_type):
            return s
        else:
            raise TypeError("not expecting type '%s'" % type(s))

    return six_ensure_text(text, encoding="utf-8", errors="ignore")
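# Illustrative usage (not part of the original patch): byte strings are decoded as
# UTF-8 and existing str objects pass through unchanged, e.g.
#   convert_to_unicode(b"caf\xc3\xa9")  ->  "café"
#   convert_to_unicode("déjà vu")       ->  "déjà vu"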
class FlaubertTokenizer(XLMTokenizer):
    """
    BPE tokenizer for FlauBERT:

    - Moses preprocessing and tokenization
    - Normalizes all input text
    - The argument ``special_tokens`` and the function ``set_special_tokens`` can be used to add \
      additional symbols (e.g. "__classify__") to a vocabulary
    - ``do_lowercase`` controls lower casing (automatically set for pretrained vocabularies)
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, do_lowercase=False, **kwargs):
        super().__init__(**kwargs)
        self.do_lowercase = do_lowercase
        self.do_lowercase_and_remove_accent = False

    def preprocess_text(self, text):
        text = text.replace("``", '"').replace("''", '"')
        text = convert_to_unicode(text)
        text = unicodedata.normalize("NFC", text)

        if self.do_lowercase:
            text = text.lower()

        return text
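    # Illustrative example (not part of the original patch): quote markers are unified
    # and the text is NFC-normalized before optional lowercasing, e.g.
    #   preprocess_text("Un ``exemple'' simple")  ->  'Un "exemple" simple'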
    def _tokenize(self, text, bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:
        - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
        - Install with `pip install sacremoses`

        Args:
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally
              (default = False) (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        lang = "fr"
        if lang and self.lang2id and lang not in self.lang2id:
            logger.error(
                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
            )

        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.preprocess_text(text)
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend([t for t in self.bpe(token).split(" ")])

        return split_tokens
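End to end, the tokenizer can be exercised as sketched below (assuming `sacremoses` is installed and the published vocabulary files are reachable; the exact BPE pieces depend on the downloaded vocabulary):

```python
from transformers import FlaubertTokenizer

tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")

tokens = tokenizer.tokenize("Bonjour, comment allez-vous ?")
ids = tokenizer.convert_tokens_to_ids(tokens)
text = tokenizer.decode(tokenizer.encode("Bonjour, comment allez-vous ?"))
```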