Commit b43cb09a authored by Hang Le, committed by Lysandre Debut

Add layerdrop

parent df27648b
@@ -51,6 +51,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
 11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.

 .. toctree::
     :maxdepth: 2
@@ -45,7 +45,10 @@ class FlaubertConfig(XLMConfig):
     Args:
         pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to apply the layer normalization before or after the feed forward layer following the
-            attention in each layer.
+            attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
+            with Structured Dropout. ICLR 2020)
         vocab_size (:obj:`int`, optional, defaults to 30145):
             Vocabulary size of the XLM model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
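For orientation, here is a minimal usage sketch of the two options documented above. It assumes FlaubertConfig accepts layerdrop and pre_norm as keyword arguments, as the docstring suggests; the 0.2 value is purely illustrative.

    from transformers import FlaubertConfig, FlaubertModel

    # layerdrop and pre_norm are plain config attributes; the default
    # layerdrop of 0.0 disables LayerDrop entirely.
    config = FlaubertConfig(layerdrop=0.2, pre_norm=True)
    model = FlaubertModel(config)

    model.train()  # layers are dropped only while model.training is True
    model.eval()   # at inference every layer runs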
@@ -16,6 +16,7 @@
 import logging
 import random
 import torch
+from torch.nn import functional as F
@@ -113,8 +114,8 @@ class FlaubertModel(XLMModel):
     def __init__(self, config):  # , dico, is_encoder, with_output):
         super(FlaubertModel, self).__init__(config)
-        self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop
-        self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)

     @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
     def forward(
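The getattr rewrite in this hunk is behavior-preserving. A standalone equivalence check (the SimpleNamespace stand-in is hypothetical, not part of the diff):

    from types import SimpleNamespace

    # Stand-in config that defines pre_norm but not layerdrop.
    config = SimpleNamespace(pre_norm=True)

    # Old and new spellings agree whether the attribute is missing or present.
    assert (0.0 if not hasattr(config, "layerdrop") else config.layerdrop) == getattr(config, "layerdrop", 0.0)
    assert (False if not hasattr(config, "pre_norm") else config.pre_norm) == getattr(config, "pre_norm", False)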
@@ -243,6 +244,11 @@ class FlaubertModel(XLMModel):
         hidden_states = ()
         attentions = ()
         for i in range(self.n_layers):
+            # LayerDrop
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
             if self.output_hidden_states:
                 hidden_states = hidden_states + (tensor,)
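Taken together, the change implements LayerDrop (Fan et al., ICLR 2020): during training each transformer layer is skipped with probability layerdrop, so a model with N layers trains at an expected depth of (1 - layerdrop) * N, while at inference the full stack always runs. Below is a self-contained sketch of the same per-layer coin flip in a toy encoder; the class name and sizes are illustrative, not from the diff.

    import random

    import torch
    from torch import nn

    class ToyLayerDropEncoder(nn.Module):
        """Toy layer stack using the same skip logic the diff adds to FlaubertModel.forward."""

        def __init__(self, n_layers=12, d_model=16, layerdrop=0.1):
            super().__init__()
            self.layers = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(n_layers))
            self.layerdrop = layerdrop

        def forward(self, x):
            for layer in self.layers:
                # During training, skip the whole layer with probability layerdrop
                # (one coin flip per layer, exactly as in the hunk above). In eval
                # mode self.training is False, so every layer runs.
                if self.training and random.uniform(0, 1) < self.layerdrop:
                    continue
                x = torch.relu(layer(x))
            return x

    encoder = ToyLayerDropEncoder()
    encoder.train()
    out = encoder(torch.randn(2, 16))  # stochastic depth: ~10% of layers skipped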