"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "818997788584b9fc043d8b58e078f63aadb6b60e"
Unverified commit 228cdd6a, authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into conditional-generation

parents 3cf2020c 079bfb32
@@ -86,7 +86,6 @@ def glue_convert_examples_to_features(examples, tokenizer,
             example.text_b,
             add_special_tokens=True,
             max_length=max_length,
-            truncate_first_sequence=True  # We're truncating the first sequence in priority
         )
         input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
...
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 try:
     import tensorflow as tf
-    assert int(tf.__version__[0]) >= 2
+    assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
     _tf_available = True  # pylint: disable=invalid-name
     logger.info("TensorFlow version {} available.".format(tf.__version__))
 except (ImportError, AssertionError):
@@ -246,7 +246,7 @@ def http_get(url, temp_file, proxies=None):
     progress.close()

-def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -266,12 +266,12 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
         etag = s3_etag(url, proxies=proxies)
     else:
         try:
-            response = requests.head(url, allow_redirects=True, proxies=proxies)
+            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
            if response.status_code != 200:
                 etag = None
             else:
                 etag = response.headers.get("ETag")
-        except EnvironmentError:
+        except (EnvironmentError, requests.exceptions.Timeout):
            etag = None

    if sys.version_info[0] == 2 and etag is not None:
...
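Note: a small, self-contained sketch of the pattern this change enables (not the library function itself): a HEAD request that gives up after `etag_timeout` seconds instead of hanging the cache lookup, with the timeout treated the same as any other connection failure.

```python
import requests

def fetch_etag(url, proxies=None, etag_timeout=10):
    """Return the ETag header for `url`, or None if the server is slow or unreachable."""
    try:
        # A hanging HEAD request now fails fast instead of blocking the cache lookup.
        response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
        if response.status_code != 200:
            return None
        return response.headers.get("ETag")
    except (EnvironmentError, requests.exceptions.Timeout):
        return None
```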
@@ -21,6 +21,7 @@ import logging
 from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
 from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
 from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
 from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
 from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
 from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
@@ -51,6 +52,7 @@ class AutoModel(object):
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
             - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
             - contains `xlnet`: XLNetModel (XLNet model)
             - contains `xlm`: XLMModel (XLM model)
@@ -73,6 +75,7 @@ class AutoModel(object):
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
             - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
             - contains `xlnet`: XLNetModel (XLNet model)
             - contains `xlm`: XLMModel (XLM model)
@@ -149,10 +152,11 @@ class AutoModel(object):
             return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))

 class AutoModelWithLMHead(object):
@@ -172,6 +176,7 @@ class AutoModelWithLMHead(object):
             - contains `bert`: BertForMaskedLM (Bert model)
             - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
             - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLLMModel (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
             - contains `xlnet`: XLNetLMHeadModel (XLNet model)
             - contains `xlm`: XLMWithLMHeadModel (XLM model)
@@ -273,10 +278,11 @@ class AutoModelWithLMHead(object):
             return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))

 class AutoModelForSequenceClassification(object):
...
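Note: with the dispatch above, any identifier containing `ctrl` now resolves to the CTRL classes. A hedged usage sketch, not part of this commit; the `'ctrl'` checkpoint name and the token ids below are illustrative assumptions, and the checkpoint itself is large.

```python
import torch
from transformers import AutoModel, AutoModelWithLMHead

# Dispatch is purely substring-based: 'ctrl' in the identifier selects the CTRL classes.
base_model = AutoModel.from_pretrained('ctrl')          # -> CTRLModel
lm_model = AutoModelWithLMHead.from_pretrained('ctrl')  # -> CTRLLMHeadModel

input_ids = torch.tensor([[11859, 586, 20984]])  # arbitrary token ids, batch size 1
last_hidden_state = base_model(input_ids)[0]
lm_logits = lm_model(input_ids)[0]
```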
@@ -46,6 +46,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
+    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
+    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
 }
@@ -1194,12 +1196,16 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     Examples::

         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
+        input_ids = tokenizer.encode(input_text)
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
+        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
+        # a nice puppet
     """
     def __init__(self, config):
...
This diff is collapsed.
@@ -159,8 +159,6 @@ class MultiHeadSelfAttention(nn.Module):
         dim_per_head = self.dim // self.n_heads

-        assert 2 <= mask.dim() <= 3
-        causal = (mask.dim() == 3)
         mask_reshp = (bs, 1, 1, k_length)

         def shape(x):
...
@@ -347,6 +347,7 @@ class GPT2Model(GPT2PreTrainedModel):
         super(GPT2Model, self).__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
+        self.output_past = config.output_past

         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
@@ -440,7 +441,8 @@ class GPT2Model(GPT2PreTrainedModel):
                             head_mask=head_mask[i])

             hidden_states, present = outputs[:2]
-            presents = presents + (present,)
+            if self.output_past:
+                presents = presents + (present,)

             if self.output_attentions:
                 all_attentions.append(outputs[2])
@@ -452,7 +454,9 @@ class GPT2Model(GPT2PreTrainedModel):
         if self.output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

-        outputs = (hidden_states, presents)
+        outputs = (hidden_states,)
+        if self.output_past:
+            outputs = outputs + (presents,)
         if self.output_hidden_states:
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
@@ -460,7 +464,7 @@ class GPT2Model(GPT2PreTrainedModel):
             attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
             all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
             outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
+        return outputs  # last hidden state, (presents), (all hidden_states), (attentions)

 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
...
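Note: a minimal sketch of what the new `output_past` flag controls, assuming the companion configuration change lets `GPT2Config` accept and store `output_past` (as `self.output_past = config.output_past` above implies). With the flag off, the cached key/value `presents` are simply dropped from the output tuple.

```python
import torch
from transformers import GPT2Config, GPT2Model

# Assumption: the config accepts/stores `output_past`, per the constructor change above.
config = GPT2Config(output_past=False)
model = GPT2Model(config)  # randomly initialised, no download needed

input_ids = torch.tensor([[464, 3290, 318, 13779]])  # arbitrary token ids
outputs = model(input_ids)

# With output_past=False the tuple is just (last_hidden_state,) plus any optional extras;
# with the default output_past=True it would also contain `presents` in second position.
print(len(outputs))
```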
@@ -170,7 +170,7 @@ class Attention(nn.Module):
         # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
         # XD: self.b may be larger than w, so we need to crop it
         b = self.bias[:, :, : w.size(-2), : w.size(-1)]
-        w = w * b + -1e9 * (1 - b)
+        w = w * b + - 1e4 * (1 - b)

         if attention_mask is not None:
             # Apply the attention mask
...
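Note: lowering the masking constant from -1e9 to -1e4 keeps it representable in float16 (whose maximum magnitude is about 65504) while still driving masked positions to near-zero probability after the softmax. A standalone illustration, separate from the model code:

```python
import torch

scores = torch.tensor([[2.0, 1.0, 0.5]])
mask = torch.tensor([[1.0, 1.0, 0.0]])  # 1 = keep, 0 = mask out

# Additive masking as in the diff: masked positions get a large negative value.
masked = scores * mask + -1e4 * (1 - mask)
probs = torch.softmax(masked, dim=-1)
print(probs)  # the masked position receives (near-)zero probability

# -1e9 is not representable in float16 and overflows to -inf, which is why
# the constant was lowered to -1e4.
print(torch.tensor(-1e9, dtype=torch.float16))  # tensor(-inf, dtype=torch.float16)
```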
@@ -34,6 +34,7 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
 }

 class RobertaEmbeddings(BertEmbeddings):
@@ -172,7 +173,8 @@ class RobertaModel(BertModel):
         if input_ids[:, 0].sum().item() != 0:
             logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
-                           "Please specify add_special_tokens=True in your encoding.")
+                           "Please specify add_special_tokens=True in your tokenize.encode()"
+                           "or tokenizer.convert_tokens_to_ids().")
         return super(RobertaModel, self).forward(input_ids,
                                                  attention_mask=attention_mask,
                                                  token_type_ids=token_type_ids,
@@ -341,6 +343,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)

 @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
                          the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
                       ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@@ -449,6 +452,81 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
+                         the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForTokenClassification(BertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForTokenClassification.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), scores, (hidden_states), (attentions)
+

 class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
...
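Note: the new RobertaForTokenClassification only counts positions where `attention_mask == 1` towards the loss. A standalone sketch of that masking pattern with made-up shapes, separate from the class itself:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch_size, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 0 marks padding

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1            # keep only real tokens
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)          # padding never contributes
print(loss.item())
```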
@@ -26,6 +26,7 @@ from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSeque
 from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
+from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel

 from .file_utils import add_start_docstrings
@@ -52,6 +53,7 @@ class TFAutoModel(object):
             - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetModel (XLNet model)
             - contains `xlm`: TFXLMModel (XLM model)
+            - contains `ctrl`: TFCTRLModel (CTRL model)

         This class cannot be instantiated using `__init__()` (throws an error).
     """
@@ -73,7 +75,7 @@ class TFAutoModel(object):
             - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
             - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetModel (XLNet model)
-            - contains `xlm`: TFXLMModel (XLM model)
+            - contains `ctrl`: TFCTRLModel (CTRL model)

         Params:
             pretrained_model_name_or_path: either:
@@ -147,10 +149,12 @@ class TFAutoModel(object):
             return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))

 class TFAutoModelWithLMHead(object):
@@ -173,6 +177,7 @@ class TFAutoModelWithLMHead(object):
             - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
             - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: TFCTRLLMHeadModel (CTRL model)

         This class cannot be instantiated using `__init__()` (throws an error).
     """
@@ -198,6 +203,7 @@ class TFAutoModelWithLMHead(object):
             - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
             - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: TFCTRLLMHeadModel (CTRL model)

         Params:
             pretrained_model_name_or_path: either:
@@ -271,10 +277,12 @@ class TFAutoModelWithLMHead(object):
             return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))

 class TFAutoModelForSequenceClassification(object):
...
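Note: the TF2 auto classes gain the same substring dispatch. A brief hedged sketch; whether a TF-format `'ctrl'` checkpoint is published is an assumption here, and the token ids are arbitrary.

```python
import tensorflow as tf
from transformers import TFAutoModelWithLMHead

# 'ctrl' in the identifier routes to TFCTRLLMHeadModel (assumes a TF checkpoint exists under this name).
model = TFAutoModelWithLMHead.from_pretrained('ctrl')

input_ids = tf.constant([[11859, 586, 20984]])  # arbitrary token ids, batch size 1
lm_logits = model(input_ids)[0]
```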
@@ -30,7 +30,6 @@ import tensorflow as tf
 from .configuration_bert import BertConfig
 from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -52,14 +51,6 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 }

-def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def gelu(x):
     """ Gaussian Error Linear Unit.
     Original Implementation of the gelu activation function in Google Bert repo when initially created.
@@ -545,7 +536,6 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
     """
     config_class = BertConfig
     pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_bert_pt_weights_in_tf2
     base_model_prefix = "bert"
...
This diff is collapsed.
@@ -31,7 +31,6 @@ import tensorflow as tf
 from .configuration_distilbert import DistilBertConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -66,14 +65,6 @@ def gelu_new(x):
                         (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf

-def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-    tf_inputs = [inputs_list, attns_list]
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 class TFEmbeddings(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFEmbeddings, self).__init__(**kwargs)
@@ -226,8 +217,6 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
         dim_per_head = self.dim // self.n_heads

-        assert 2 <= len(tf.shape(mask)) <= 3
-        causal = (len(tf.shape(mask)) == 3)
         mask_reshape = [bs, 1, 1, k_length]

         def shape(x):
@@ -456,7 +445,6 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel):
     """
     config_class = DistilBertConfig
     pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_distilbert_pt_weights_in_tf2
     base_model_prefix = "distilbert"
...
@@ -32,7 +32,6 @@ from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                 TFSequenceSummary, shape_list, get_initializer)
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -42,14 +41,6 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models
                                         "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}

-def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def gelu(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
@@ -350,7 +341,6 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
     """
     config_class = GPT2Config
     pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_gpt2_pt_weights_in_tf2
     base_model_prefix = "transformer"
...
@@ -32,21 +32,12 @@ from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                 TFSequenceSummary, shape_list, get_initializer)
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)

 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}

-def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def gelu(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
@@ -335,7 +326,6 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
     """
     config_class = OpenAIGPTConfig
     pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_openai_gpt_pt_weights_in_tf2
     base_model_prefix = "transformer"
...
@@ -25,8 +25,6 @@ import numpy
 logger = logging.getLogger(__name__)

-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-

 def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''):
     """ Convert a TF 2.0 model variable name in a pytorch model weight name.
@@ -105,7 +103,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         raise e

     if tf_inputs is None:
-        tf_inputs = tf.constant(DUMMY_INPUTS)
+        tf_inputs = tf_model.dummy_inputs

     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure model is built
@@ -200,7 +198,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
         tf_model = tf_model_class(pt_model.config)

     if tf_inputs is None:
-        tf_inputs = tf.constant(DUMMY_INPUTS)
+        tf_inputs = tf_model.dummy_inputs

     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure model is built
...
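Note: with the module-level DUMMY_INPUTS gone from this file, the conversion helpers now ask the model itself for build inputs, which is what removes the per-architecture load_*_pt_weights_in_tf2 helpers elsewhere in this commit. A hedged sketch of the calling pattern; the checkpoint path is a placeholder, not a real file.

```python
from transformers import BertConfig, TFBertModel
from transformers.modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

config = BertConfig()
tf_model = TFBertModel(config)

# Each TF model now carries its own dummy_inputs, so the Keras graph can be
# built before weights are copied over from a PyTorch checkpoint.
_ = tf_model(tf_model.dummy_inputs, training=False)   # builds the network
tf_model = load_pytorch_checkpoint_in_tf2_model(
    tf_model, "path/to/pytorch_model.bin")             # placeholder path
```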
@@ -26,7 +26,6 @@ import tensorflow as tf
 from .configuration_roberta import RobertaConfig
 from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
@@ -36,16 +35,9 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5",
 }

-def load_roberta_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 class TFRobertaEmbeddings(TFBertEmbeddings):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
@@ -83,7 +75,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
             input_ids = inputs

         if tf.not_equal(tf.reduce_sum(input_ids[:, 0]), 0):
-            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
+            tf.print("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
                            "Please specify add_special_tokens=True in your encoding.")
@@ -96,7 +88,6 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel):
     """
     config_class = RobertaConfig
     pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_roberta_pt_weights_in_tf2
     base_model_prefix = "roberta"
@@ -380,3 +371,54 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
         outputs = (logits,) + outputs[2:]

         return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of
+                         the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import RobertaTokenizer, TFRobertaForTokenClassification
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFRobertaMainLayer(config, name='roberta')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.roberta(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)
@@ -33,7 +33,6 @@ from .configuration_transfo_xl import TransfoXLConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -41,14 +40,6 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
 }

-def load_transfo_xl_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 class TFPositionalEmbedding(tf.keras.layers.Layer):
     def __init__(self, demb, **kwargs):
         super(TFPositionalEmbedding, self).__init__(**kwargs)
@@ -577,7 +568,6 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
     """
     config_class = TransfoXLConfig
     pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_transfo_xl_pt_weights_in_tf2
     base_model_prefix = "transformer"
...
@@ -25,9 +25,11 @@ import tensorflow as tf
 from .configuration_utils import PretrainedConfig
 from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)

+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+

 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.
@@ -48,8 +50,8 @@ class TFPreTrainedModel(tf.keras.Model):
     """
     config_class = None
     pretrained_model_archive_map = {}
-    load_pt_weights = lambda model, config, path: None
     base_model_prefix = ""
+    dummy_inputs = tf.constant(DUMMY_INPUTS)  # dummy inputs to build the network

     def __init__(self, config, *inputs, **kwargs):
         super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -262,17 +264,16 @@ class TFPreTrainedModel(tf.keras.Model):
         if from_pt:
             # Load from a PyTorch checkpoint
-            return cls.load_pt_weights(model, resolved_archive_file)
+            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)

-        inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-        ret = model(inputs, training=False)  # build the network with dummy inputs
+        ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs

         assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
         # 'by_name' allow us to do transfer learning by skipping/adding layers
         # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
         model.load_weights(resolved_archive_file, by_name=True)

-        ret = model(inputs, training=False)  # Make sure restore ops are run
+        ret = model(model.dummy_inputs, training=False)  # Make sure restore ops are run

         return model
@@ -393,26 +394,26 @@ class TFSequenceSummary(tf.keras.layers.Layer):
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
             raise NotImplementedError

-        self.summary = None
-        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
+        self.has_summary = hasattr(config, 'summary_use_proj') and config.summary_use_proj
+        if self.has_summary:
             if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                 num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = tf.keras.layers.Dense(num_classes,
                                                  kernel_initializer=get_initializer(initializer_range),
                                                  name='summary')

-        self.activation = None
-        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+        self.has_activation = hasattr(config, 'summary_activation') and config.summary_activation == 'tanh'
+        if self.has_activation:
             self.activation = tf.keras.activations.tanh

-        self.first_dropout = None
-        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+        self.has_first_dropout = hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0
+        if self.has_first_dropout:
             self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

-        self.last_dropout = None
-        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+        self.has_last_dropout = hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0
+        if self.has_last_dropout:
             self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)

     def call(self, inputs, training=False):
@@ -455,17 +456,17 @@ class TFSequenceSummary(tf.keras.layers.Layer):
         elif self.summary_type == 'attn':
             raise NotImplementedError

-        if training and self.first_dropout is not None:
-            output = self.first_dropout(output)
+        if self.has_first_dropout:
+            output = self.first_dropout(output, training=training)

-        if self.summary is not None:
+        if self.has_summary:
             output = self.summary(output)

-        if self.activation is not None:
+        if self.has_activation:
             output = self.activation(output)

-        if training and self.last_dropout is not None:
-            output = self.last_dropout(output)
+        if self.has_last_dropout:
+            output = self.last_dropout(output, training=training)

         return output
...
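Note: the TFSequenceSummary refactor relies on `tf.keras.layers.Dropout` being an identity when called with `training=False`, which is why the old `if training and ...` guards can go away. A tiny standalone check, separate from the library code:

```python
import tensorflow as tf

dropout = tf.keras.layers.Dropout(0.5)
x = tf.ones((1, 4))

print(dropout(x, training=False))  # identity: [[1. 1. 1. 1.]]
print(dropout(x, training=True))   # some entries zeroed, the rest scaled by 1/(1-0.5)
```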
@@ -25,9 +25,8 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xlm import XLMConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer, DUMMY_INPUTS
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -45,19 +44,6 @@ TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
 }

-def load_xlm_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-    if tf_model.config.use_lang_emb and tf_model.config.n_langs > 1:
-        langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-    else:
-        langs_list = None
-    tf_inputs = [inputs_list, attns_list, langs_list]
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def create_sinusoidal_embeddings(n_pos, dim, out):
     position_enc = np.array([
         [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
@@ -441,9 +427,19 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
     """
     config_class = XLMConfig
     pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_xlm_pt_weights_in_tf2
     base_model_prefix = "transformer"

+    @property
+    def dummy_inputs(self):
+        # Sometimes XLM has language embeddings so don't forget to build them as well if needed
+        inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return [inputs_list, attns_list, langs_list]
+

 XLM_START_DOCSTRING = r"""    The XLM model was proposed in
     `Cross-lingual Language Model Pretraining`_
...