Merge branch 'master' into cli

c7be096c · thomwolf · 3492a6ec · 33adab2b · c7be096c · c7be096c
Commit c7be096c authored Dec 19, 2019 by thomwolf
20 changed files
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
-                yield  ({'input_ids': ex.input_ids,
+                yield ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)

--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -18,19 +18,20 @@ if is_tf_available():

 logger = logging.getLogger(__name__)

-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                        orig_answer_text):
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

+
 def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    best_score = None
@@ -50,10 +51,11 @@ def _check_is_max_context(doc_spans, cur_span_index, position):

    return cur_span_index == best_span_index

+
 def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    # if len(doc_spans) == 1:
-        # return True
+    # return True
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
@@ -71,14 +73,16 @@ def _new_check_is_max_context(doc_spans, cur_span_index, position):

    return cur_span_index == best_span_index

+
 def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

-def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training, 
-                                       return_dataset=False):
+
+def squad_convert_examples_to_features(
+    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
+):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
@@ -112,24 +116,23 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
        )
    """

-    # Defining helper methods    
+    # Defining helper methods
    unique_id = 1000000000

    features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
+    for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
        if is_training and not example.is_impossible:
            # Get start and end position
            start_position = example.start_position
            end_position = example.end_position

            # If the answer cannot be found in the text, then skip this example.
-            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
+            actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                continue

-
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
@@ -140,7 +143,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

-
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
@@ -153,36 +155,41 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
            )

        spans = []
-        
-        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
-        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence 
-        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair 
+
+        truncated_query = tokenizer.encode(
+            example.question_text, add_special_tokens=False, max_length=max_query_length
+        )
+        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
+        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

        span_doc_tokens = all_doc_tokens
        while len(spans) * doc_stride < len(all_doc_tokens):
-            
+
            encoded_dict = tokenizer.encode_plus(
-                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, 
-                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, 
-                max_length=max_seq_length, 
-                return_overflowing_tokens=True, 
+                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
+                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
+                max_length=max_seq_length,
+                return_overflowing_tokens=True,
                pad_to_max_length=True,
                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
+                truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
            )

-            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+            paragraph_len = min(
+                len(all_doc_tokens) - len(spans) * doc_stride,
+                max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+            )

-            if tokenizer.pad_token_id in encoded_dict['input_ids']: 
-                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+            if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
-                non_padded_ids = encoded_dict['input_ids']
+                non_padded_ids = encoded_dict["input_ids"]

            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

            token_to_orig_map = {}
            for i in range(paragraph_len):
-                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i 
+                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

            encoded_dict["paragraph_len"] = paragraph_len
@@ -202,16 +209,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
        for doc_span_index in range(len(spans)):
            for j in range(spans[doc_span_index]["paragraph_len"]):
                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                index = (
+                    j
+                    if tokenizer.padding_side == "left"
+                    else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                )
                spans[doc_span_index]["token_is_max_context"][index] = is_max_context

        for span in spans:
            # Identify the position of the CLS token
-            cls_index = span['input_ids'].index(tokenizer.cls_token_id)
+            cls_index = span["input_ids"].index(tokenizer.cls_token_id)

            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
            # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = np.array(span['token_type_ids'])
+            p_mask = np.array(span["token_type_ids"])

            p_mask = np.minimum(p_mask, 1)

@@ -224,7 +235,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
            # Set the CLS index to '0'
            p_mask[cls_index] = 0

-
            span_is_impossible = example.is_impossible
            start_position = 0
            end_position = 0
@@ -247,55 +257,99 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                        doc_offset = 0
                    else:
                        doc_offset = len(truncated_query) + sequence_added_tokens
-                        
+
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

-
-            features.append(SquadFeatures(
-                span['input_ids'],
-                span['attention_mask'],
-                span['token_type_ids'],
-                cls_index,
-                p_mask.tolist(),
-
-                example_index=example_index,
-                unique_id=unique_id,
-                paragraph_len=span['paragraph_len'],
-                token_is_max_context=span["token_is_max_context"],
-                tokens=span["tokens"],
-                token_to_orig_map=span["token_to_orig_map"],
-                
-                start_position=start_position,
-                end_position=end_position
-            ))
+            features.append(
+                SquadFeatures(
+                    span["input_ids"],
+                    span["attention_mask"],
+                    span["token_type_ids"],
+                    cls_index,
+                    p_mask.tolist(),
+                    example_index=example_index,
+                    unique_id=unique_id,
+                    paragraph_len=span["paragraph_len"],
+                    token_is_max_context=span["token_is_max_context"],
+                    tokens=span["tokens"],
+                    token_to_orig_map=span["token_to_orig_map"],
+                    start_position=start_position,
+                    end_position=end_position,
+                )
+            )

            unique_id += 1

-    if return_dataset == 'pt':
+    if return_dataset == "pt":
        if not is_torch_available():
            raise ImportError("Pytorch must be installed to return a pytorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if not is_training:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                    all_example_index, all_cls_index, all_p_mask)
+            dataset = TensorDataset(
+                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
+            )
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                    all_start_positions, all_end_positions,
-                                    all_cls_index, all_p_mask)
+            dataset = TensorDataset(
+                all_input_ids,
+                all_attention_masks,
+                all_token_type_ids,
+                all_start_positions,
+                all_end_positions,
+                all_cls_index,
+                all_p_mask,
+            )

        return features, dataset
-        
+    elif return_dataset == "tf":
+        if not is_tf_available():
+            raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")
+
+        def gen():
+            for ex in features:
+                yield (
+                    {
+                        "input_ids": ex.input_ids,
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
+                    }, {
+                        "start_position": ex.start_position,
+                        "end_position": ex.end_position,
+                        "cls_index": ex.cls_index,
+                        "p_mask": ex.p_mask,
+                    }
+                )
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
+                {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
+            ),
+            (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                {
+                    "start_position": tf.TensorShape([]),
+                    "end_position": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                },
+            ),
+        )

    return features

@@ -305,31 +359,32 @@ class SquadProcessor(DataProcessor):
    Processor for the SQuAD data set.
    Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
    """
+
    train_file = None
    dev_file = None

    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
        if not evaluate:
-            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
-            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
+            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
+            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
            answers = []
        else:
-            answers = [{
-                "answer_start": start.numpy(), 
-                "text": text.numpy().decode('utf-8')
-            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]
+            answers = [
+                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
+                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
+            ]

            answer = None
            answer_start = None

        return SquadExample(
-            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
-            question_text=tensor_dict['question'].numpy().decode('utf-8'),
-            context_text=tensor_dict['context'].numpy().decode('utf-8'),
+            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
+            question_text=tensor_dict["question"].numpy().decode("utf-8"),
+            context_text=tensor_dict["context"].numpy().decode("utf-8"),
            answer_text=answer,
            start_position_character=answer_start,
-            title=tensor_dict['title'].numpy().decode('utf-8'),
-            answers=answers
+            title=tensor_dict["title"].numpy().decode("utf-8"),
+            answers=answers,
        )

    def get_examples_from_dataset(self, dataset, evaluate=False):
@@ -359,7 +414,7 @@ class SquadProcessor(DataProcessor):

        examples = []
        for tensor_dict in tqdm(dataset):
-            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) 
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))

        return examples

@@ -373,10 +428,15 @@ class SquadProcessor(DataProcessor):
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        """
+        if data_dir is None:
+            data_dir = ""
+
        if self.train_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

-        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
+        with open(
+            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "train")

@@ -389,10 +449,15 @@ class SquadProcessor(DataProcessor):
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
+        if data_dir is None:
+            data_dir = ""
+
        if self.dev_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
-        
-        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
+
+        with open(
+            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "dev")

@@ -400,7 +465,7 @@ class SquadProcessor(DataProcessor):
        is_training = set_type == "train"
        examples = []
        for entry in tqdm(input_data):
-            title = entry['title']
+            title = entry["title"]
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
@@ -409,7 +474,7 @@ class SquadProcessor(DataProcessor):
                    start_position_character = None
                    answer_text = None
                    answers = []
-                    
+
                    if "is_impossible" in qa:
                        is_impossible = qa["is_impossible"]
                    else:
@@ -418,8 +483,8 @@ class SquadProcessor(DataProcessor):
                    if not is_impossible:
                        if is_training:
                            answer = qa["answers"][0]
-                            answer_text = answer['text']
-                            start_position_character = answer['answer_start']
+                            answer_text = answer["text"]
+                            start_position_character = answer["answer_start"]
                        else:
                            answers = qa["answers"]

@@ -431,12 +496,13 @@ class SquadProcessor(DataProcessor):
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
-                        answers=answers
+                        answers=answers,
                    )

                    examples.append(example)
        return examples

+
 class SquadV1Processor(SquadProcessor):
    train_file = "train-v1.1.json"
    dev_file = "dev-v1.1.json"
@@ -445,7 +511,7 @@ class SquadV1Processor(SquadProcessor):
 class SquadV2Processor(SquadProcessor):
    train_file = "train-v2.0.json"
    dev_file = "dev-v2.0.json"
-    
+

 class SquadExample(object):
    """
@@ -462,21 +528,23 @@ class SquadExample(object):
        is_impossible: False by default, set to True if the example has no possible answer.
    """

-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 context_text,
-                 answer_text,
-                 start_position_character,
-                 title,
-                 answers=[],
-                 is_impossible=False):
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        context_text,
+        answer_text,
+        start_position_character,
+        title,
+        answers=[],
+        is_impossible=False,
+    ):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
-        self.is_impossible = is_impossible 
+        self.is_impossible = is_impossible
        self.answers = answers

        self.start_position, self.end_position = 0, 0
@@ -503,7 +571,9 @@ class SquadExample(object):
        # Start end end positions only has a value during evaluation.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
-            self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
+            self.end_position = char_to_word_offset[
+                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
+            ]


 class SquadFeatures(object):
@@ -531,24 +601,23 @@ class SquadFeatures(object):
        end_position: end of the answer token index 
    """

-    def __init__(self,
-                 input_ids,
-                 attention_mask,
-                 token_type_ids,
-                 cls_index,
-                 p_mask,
-                 
-                 example_index,
-                 unique_id,
-                 paragraph_len,
-                 token_is_max_context,
-                 tokens,
-                 token_to_orig_map,
-
-                 start_position,
-                 end_position
-        ):
-        self.input_ids = input_ids 
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        cls_index,
+        p_mask,
+        example_index,
+        unique_id,
+        paragraph_len,
+        token_is_max_context,
+        tokens,
+        token_to_orig_map,
+        start_position,
+        end_position,
+    ):
+        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.cls_index = cls_index
@@ -574,12 +643,13 @@ class SquadResult(object):
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
    """
+
    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.unique_id = unique_id
-        
+
        if start_top_index:
            self.start_top_index = start_top_index
            self.end_top_index = end_top_index
-            self.cls_logits = cls_logits
\ No newline at end of file
+            self.cls_logits = cls_logits
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -21,38 +21,35 @@ import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
 import requests
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from contextlib import contextmanager

 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

-try:
-    os.environ.setdefault('USE_TF', 'YES')
-    if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
-        import tensorflow as tf
-        assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
-        _tf_available = True  # pylint: disable=invalid-name
-        logger.info("TensorFlow version {} available.".format(tf.__version__))
-    else:
-        logger.info("USE_TF override through env variable, disabling Tensorflow")
-        _tf_available = False
-
-except (ImportError, AssertionError):
-    _tf_available = False  # pylint: disable=invalid-name
-
 try:
    os.environ.setdefault('USE_TORCH', 'YES')
    if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'):
        import torch
        _torch_available = True  # pylint: disable=invalid-name
        logger.info("PyTorch version {} available.".format(torch.__version__))
-
    else:
        logger.info("USE_TORCH override through env variable, disabling PyTorch")
        _torch_available = False
 except ImportError:
    _torch_available = False  # pylint: disable=invalid-name

+try:
+    os.environ.setdefault('USE_TF', 'YES')
+    if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
+        import tensorflow as tf
+        assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
+        _tf_available = True  # pylint: disable=invalid-name
+        logger.info("TensorFlow version {} available.".format(tf.__version__))
+    else:
+        logger.info("USE_TF override through env variable, disabling Tensorflow")
+        _tf_available = False
+except (ImportError, AssertionError):
+    _tf_available = False  # pylint: disable=invalid-name

 try:
    from torch.hub import _get_torch_home
@@ -84,6 +81,13 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
+MODEL_CARD_NAME = "model_card.json"
+
+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
+
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+

 def is_torch_available():
    return _torch_available
@@ -116,6 +120,18 @@ else:
            return fn
        return docstring_decorator

+
+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ('http', 'https', 's3')
+
+def hf_bucket_url(identifier, postfix=None):
+    if postfix is None:
+        return "/".join((S3_BUCKET_PREFIX, identifier))
+    else:
+        return "/".join((S3_BUCKET_PREFIX, identifier, postfix))
+
+
 def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
@@ -184,9 +200,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ('http', 'https', 's3'):
+    if is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir=cache_dir,
            force_download=force_download, proxies=proxies,
@@ -194,7 +208,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
-    elif parsed.scheme == '':
+    elif urlparse(url_or_filename).scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
@@ -258,7 +272,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
        return
    content_length = response.headers.get('Content-Length')
    total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(unit="B", total=total, initial=resume_size)
+    progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading")
    for chunk in response.iter_content(chunk_size=1024):
        if chunk: # filter out keep-alive new chunks
            progress.update(len(chunk))

--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
@@ -131,8 +131,9 @@ class HfApi:
        # the client still has to specify it when uploading the file.
        with open(filepath, "rb") as f:
            pf = TqdmProgressFileReader(f)
+            data = f if pf.total_size > 0 else ""

-            r = requests.put(urls.write, data=f, headers={
+            r = requests.put(urls.write, data=data, headers={
                "content-type": urls.type,
            })
            r.raise_for_status()

--- a/transformers/model_card.py
+++ b/transformers/model_card.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Configuration base class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+from io import open
+
+from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url
+
+
+logger = logging.getLogger(__name__)
+
+
+class ModelCard(object):
+    r""" Model Card class.
+        Store model card as well as methods for loading/downloading/saving model cards.
+
+        Please read the following paper for details and explanation on the sections:
+            "Model Cards for Model Reporting"
+                by Margaret Mitchell, Simone Wu,
+                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
+                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
+            Link: https://arxiv.org/abs/1810.03993
+
+        Note:
+            A model card can be loaded and saved to disk.
+
+        Parameters:
+    """
+    def __init__(self, **kwargs):
+        # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+        self.model_details = kwargs.pop('model_details', {})
+        self.intended_use = kwargs.pop('intended_use', {})
+        self.factors = kwargs.pop('factors', {})
+        self.metrics = kwargs.pop('metrics', {})
+        self.evaluation_data = kwargs.pop('evaluation_data', {})
+        self.training_data = kwargs.pop('training_data', {})
+        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
+        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
+        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
+
+        # Open additional attributes
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
+    def save_pretrained(self, save_directory_or_file):
+        """ Save a model card object to the directory or file `save_directory_or_file`.
+        """
+        if os.path.isdir(save_directory_or_file):
+            # If we save using the predefined names, we can load using `from_pretrained`
+            output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
+        else:
+            output_model_card_file = save_directory_or_file
+
+        self.to_json_file(output_model_card_file)
+        logger.info("Model card saved in {}".format(output_model_card_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
+                - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                card should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+
+                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model card file and override the cached version if it exists.
+
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final model card object.
+                - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+
+        Examples::
+
+            model_card = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
+            model_card = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+            model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json')
+            model_card = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
+        proxies = kwargs.pop('proxies', None)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            # For simplicity we use the same pretrained url than the configuration files but with a different suffix (model_card.json)
+            model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
+        elif os.path.isdir(pretrained_model_name_or_path):
+            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+            model_card_file = pretrained_model_name_or_path
+        else:
+            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
+
+        try:
+            # Load from URL or cache if already cached
+            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
+                                               proxies=proxies, resume_download=resume_download)
+            if resolved_model_card_file == model_card_file:
+                logger.info("loading model card file {}".format(model_card_file))
+            else:
+                logger.info("loading model card file {} from cache at {}".format(
+                    model_card_file, resolved_model_card_file))
+            # Load model card
+            model_card = cls.from_json_file(resolved_model_card_file)
+
+        except EnvironmentError:
+            if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
+                        model_card_file))
+            else:
+                logger.warning("Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a model card file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
+                        model_card_file, MODEL_CARD_NAME))
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            model_card = cls()
+
+        except json.JSONDecodeError:
+            logger.warning("Couldn't reach server at '{}' to download model card file or "
+                           "model card file is not a valid JSON file. "
+                           "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            model_card = cls()
+
+        # Update model card with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(model_card, key):
+                setattr(model_card, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model card: %s", str(model_card))
+        if return_unused_kwargs:
+            return model_card, kwargs
+        else:
+            return model_card
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `ModelCard` from a Python dictionary of parameters."""
+        return cls(**json_object)
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `ModelCard` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -23,21 +23,24 @@ from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRL
                                 TransfoXLConfig, XLMConfig, XLNetConfig)

 from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
-    BertForTokenClassification
-from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
-from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
-from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
-from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
+    BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
-    XLNetForTokenClassification
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
+    XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
+    XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
-    RobertaForTokenClassification
+    RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
-    DistilBertForSequenceClassification, DistilBertForTokenClassification
+    DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
-    CamembertForMultipleChoice, CamembertForTokenClassification
-from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
+    CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
+    AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP

 from .modeling_utils import PreTrainedModel, SequenceSummary

@@ -46,6 +49,24 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)


+ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class AutoModel(object):
    r"""
        :class:`~transformers.AutoModel` is a generic model class
@@ -58,6 +79,7 @@ class AutoModel(object):

        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
            - contains `distilbert`: DistilBertModel (DistilBERT model)
            - contains `albert`: AlbertModel (ALBERT model)
            - contains `camembert`: CamembertModel (CamemBERT model)
@@ -130,6 +152,7 @@ class AutoModel(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
            - contains `distilbert`: DistilBertModel (DistilBERT model)
            - contains `albert`: AlbertModel (ALBERT model)
            - contains `camembert`: CamembertModel (CamemBERT model)
@@ -149,6 +172,7 @@ class AutoModel(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -201,7 +225,9 @@ class AutoModel(object):
            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'albert' in pretrained_model_name_or_path:
            return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@@ -240,6 +266,7 @@ class AutoModelWithLMHead(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
            - contains `albert`: AlbertForMaskedLM (ALBERT model)
            - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
@@ -311,6 +338,7 @@ class AutoModelWithLMHead(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
            - contains `albert`: AlbertForMaskedLM (ALBERT model)
            - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
@@ -330,6 +358,7 @@ class AutoModelWithLMHead(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -381,7 +410,9 @@ class AutoModelWithLMHead(object):
            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'albert' in pretrained_model_name_or_path:
            return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@@ -494,6 +525,7 @@ class AutoModelForSequenceClassification(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -642,6 +674,7 @@ class AutoModelForQuestionAnswering(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -818,10 +851,10 @@ class AutoModelForTokenClassification:
            return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
-            return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)


--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -48,6 +48,12 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
 }


@@ -1233,9 +1239,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
        input_ids = tokenizer.encode(input_text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
        # a nice puppet


--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -59,12 +59,14 @@ class PreTrainedEncoderDecoder(nn.Module):
            encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

@@ -217,9 +219,7 @@ class PreTrainedEncoderDecoder(nn.Module):
        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
        if encoder_hidden_states is None:
            encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[
-                0
-            ]  # output the last layer hidden state
+            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_outputs = ()


--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    """
    def __init__(self, config):
        super(GPT2DoubleHeadsModel, self).__init__(config)
+        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)

--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -590,6 +590,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)

+        config.num_labels = 1
        self.transformer = OpenAIGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)

--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
+# coding=utf-8
+# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch T5 model. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+import copy
+import itertools
+from io import open
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .configuration_t5 import T5Config
+from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# This dict contrains shortcut names and associated url
+# for the pretrained weights provided with the models
+####################################################
+T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
+}
+
+####################################################
+# This is a conversion method from TF 1.0 to PyTorch
+# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
+####################################################
+def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model.
+    """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        tf_weights[name] = array
+
+    for txt_name in names:
+        name = txt_name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
+            logger.info("Skipping {}".format("/".join(name)))
+            tf_weights.pop(txt_name, None)
+            continue
+        if '_slot_' in name[-1]:
+            logger.info("Skipping {}".format("/".join(name)))
+            tf_weights.pop(txt_name, None)
+            continue
+        pointer = model
+        array = tf_weights[txt_name]
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] in ['kernel', 'scale', 'embedding']:
+                pointer = getattr(pointer, 'weight')
+            # elif l[0] == 'scale':
+            #     pointer = getattr(pointer, 'weight')
+            # elif l[0] == 'output_bias' or l[0] == 'beta':
+            #     pointer = getattr(pointer, 'bias')
+            # elif l[0] == 'squad':
+            #     pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    logger.info("Skipping {}".format("/".join(name)))
+                    continue
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if l[0] not in ['kernel', 'scale', 'embedding']:
+            pointer = getattr(pointer, 'weight')
+        if l[0] != 'embedding':
+            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array.astype(np.float32))
+        tf_weights.pop(txt_name, None)
+
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    return model
+
+
+####################################################
+# PyTorch Models are constructed by sub-classing
+# - torch.nn.Module for the layers and
+# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
+####################################################
+
+class T5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """ Construct a layernorm module in the T5 style
+            No bias and no substraction of mean.
+        """
+        super(T5LayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        variance = x.pow(2).mean(-1, keepdim=True)
+        x = x / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * x
+
+
+class T5DenseReluDense(nn.Module):
+    def __init__(self, config):
+        super(T5DenseReluDense, self).__init__()
+        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        h = self.wi(hidden_states)
+        h = F.relu(h)
+        h = self.dropout(h)
+        h = self.wo(h)
+        return h
+
+
+class T5LayerFF(nn.Module):
+    def __init__(self, config):
+        super(T5LayerFF, self).__init__()
+        self.DenseReluDense = T5DenseReluDense(config)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        norm_x = self.layer_norm(hidden_states)
+        y = self.DenseReluDense(norm_x)
+        layer_output = hidden_states + self.dropout(y)
+        return layer_output
+
+
+class T5Attention(nn.Module):
+    NEW_ID = itertools.count()
+
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5Attention, self).__init__()
+        self.layer_id = next(T5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+
+        self.output_attentions = config.output_attentions
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.d_model = config.d_model
+        self.d_kv = config.d_kv
+        self.n_heads = config.num_heads
+        self.dropout = config.dropout_rate
+        self.inner_dim = self.n_heads * self.d_kv
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_heads, self.d_kv)
+        heads = set(heads) - self.pruned_heads
+        for head in heads:
+            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        # Prune linear layers
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.d_kv * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger buckets
+        for larger absolute relative_positions.  All relative positions >=max_distance
+        map to the same bucket.  All relative positions <=-max_distance map to the
+        same bucket.  This should allow for more graceful generalization to longer
+        sequences than the model has been trained on.
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            num_buckets //= 2
+            ret += (n < 0).to(torch.long) * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
+            n = torch.abs(n)
+        else:
+            n = torch.max(n, torch.zeros_like(n))
+        # now n is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = (n < max_exact)
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        val_if_large = max_exact + (
+            torch.log(n.float() / max_exact)
+            / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+
+        ret += torch.where(is_small, n, val_if_large)
+        return ret
+
+    def compute_bias(self, qlen, klen):
+        """ Compute binned relative position bias """
+        context_position = torch.arange(qlen, dtype=torch.long)[:, None]
+        memory_position = torch.arange(klen, dtype=torch.long)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(relative_position,  # shape (qlen, klen)
+                                                   bidirectional=not self.is_decoder,
+                                                   num_buckets=self.relative_attention_num_buckets)
+        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
+        values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
+        return values
+
+    def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = input.size()
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = kv.size(1)
+
+        def shape(x):
+            """  projection """
+            return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)
+
+        def unshape(x):
+            """  compute context """
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)
+
+        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k(k))                                          # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(v))                                          # (bs, n_heads, qlen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
+        scores = torch.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
+
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                raise ValueError("No position_bias provided and no weights to compute position_bias")
+            position_bias = self.compute_bias(qlen, klen)
+            if mask is not None:
+                position_bias = position_bias + mask                          # (bs, n_heads, qlen, klen)
+
+        scores += position_bias
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        context = self.o(context)
+
+        outputs = (context,)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
+        if self.has_relative_attention_bias:
+            outputs = outputs + (position_bias,)
+        return outputs
+
+
+class T5LayerSelfAttention(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5LayerSelfAttention, self).__init__()
+        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(norm_x,
+                                              mask=attention_mask,
+                                              position_bias=position_bias,
+                                              head_mask=head_mask)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5LayerCrossAttention(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5LayerCrossAttention, self).__init__()
+        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(norm_x,
+                                                mask=attention_mask,
+                                                kv=kv,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5Block(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5Block, self).__init__()
+        self.is_decoder = config.is_decoder
+        self.layer = nn.ModuleList()
+        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
+        if self.is_decoder:
+            self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))
+            self.layer.append(T5LayerFF(config))
+        else:
+            self.layer.append(T5LayerFF(config))
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None,
+                encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
+                head_mask=None):
+        self_attention_outputs = self.layer[0](hidden_states,
+                                                attention_mask=attention_mask,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask)
+        hidden_states = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
+
+        if not self.is_decoder:
+            hidden_states = self.layer[1](hidden_states)
+        else:
+            cross_attention_outputs = self.layer[1](hidden_states,
+                                                    kv=encoder_hidden_states,
+                                                    attention_mask=encoder_attention_mask,
+                                                    position_bias=encoder_decoder_position_bias,
+                                                    head_mask=head_mask)
+            hidden_states = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # Keep cross-attention outputs and relative position weights
+            hidden_states = self.layer[2](hidden_states)
+
+        outputs = (hidden_states,) + outputs  # add attentions if we output them
+        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+
+
+class T5PreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = T5Config
+    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_t5
+    base_model_prefix = "transformer"
+
+    @property
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {'decoder_input_ids': input_ids,
+                        'encoder_input_ids': input_ids,
+                        'decoder_attention_mask': input_mask}
+        return dummy_inputs
+
+    def _init_weights(self, module):
+        """ Initialize the weights """
+        factor = self.config.initializer_factor  # Used for testing weights initialization
+        if isinstance(module, T5LayerNorm):
+            module.weight.data.fill_(factor*1.0)
+        elif isinstance(module, (T5Model, T5WithLMHeadModel)):
+            # Mesh TensorFlow embeddings initialization
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
+        elif isinstance(module, T5DenseReluDense):
+            # Mesh TensorFlow FF initialization
+            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
+            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
+                module.wi.bias.data.zero_()
+            module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5Attention):
+            # Mesh TensorFlow attention initialization to avoid scaling before softmax
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model = self.config.d_model
+            d_kv = self.config.d_kv
+            n_heads = self.config.num_heads
+            module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
+            module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
+            module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
+            module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
+            if module.has_relative_attention_bias:
+                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
+
+
+class T5Stack(T5PreTrainedModel):
+    def __init__(self, config):
+        super(T5Stack, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.is_decoder = config.is_decoder
+
+        self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
+                                    for i in range(config.num_layers)])
+        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+        self.init_weights()
+
+    def forward(self,
+                hidden_states,
+                attention_mask=None,
+                encoder_hidden_states=None,
+                encoder_attention_mask=None,
+                head_mask=None):
+
+        batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
+        if attention_mask is None:
+            attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
+        if self.is_decoder and encoder_attention_mask is None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+        # Provided a padding mask of dimensions [batch_size, seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if self.config.is_decoder:
+                seq_ids = torch.arange(seq_length, device=hidden_states.device)
+                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                causal_mask = causal_mask.to(attention_mask)
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -1e9 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+        # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
+
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
+
+        if self.is_decoder:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+            if encoder_attention_mask.dim() == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if encoder_attention_mask.dim() == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
+
+            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_layers
+
+        all_hidden_states = ()
+        all_attentions = ()
+        position_bias = None
+        encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(hidden_states)
+        for i, layer_module in enumerate(self.block):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=extended_attention_mask,
+                                         position_bias=position_bias,
+                                         encoder_hidden_states=encoder_hidden_states,
+                                         encoder_attention_mask=encoder_extended_attention_mask,
+                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
+                                         head_mask=head_mask[i])
+            # layer_outputs is a tuple with:
+            # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+            hidden_states = layer_outputs[0]
+            if i == 0:
+                # We share the position biases between the layers - the first layer store them
+                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+                position_bias = layer_outputs[2 if self.output_attentions else 1]
+                if self.is_decoder:
+                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        layer_output = self.dropout(hidden_states)
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+            T5 is a model with relative position embeddings so you should be able to pad the inputs on
+            the right or the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
+                      "without any specific head on top.",
+                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class T5Model(T5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = T5Model.from_pretrained('t5-small')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids=input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(T5Model, self).__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = T5Stack(encoder_config)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = T5Stack(decoder_config)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def forward(self, **kwargs):
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            if encoder_attention_mask is not None:
+                # Apply masking
+                encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
+                hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        return decoder_outputs + encoder_outputs
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class T5WithLMHeadModel(T5PreTrainedModel):
+    r"""
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = T5WithLMHeadModel.from_pretrained('t5-small')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids=input_ids, lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(T5WithLMHeadModel, self).__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = T5Stack(encoder_config)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = T5Stack(decoder_config)
+
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def forward(self, **kwargs):
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
+
+        lm_labels = kwargs.pop('decoder_lm_labels', None)
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        sequence_output = decoder_outputs[0]
+        # Rescale output before projecting on vocab
+        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+        sequence_output = sequence_output * (self.model_dim ** -0.5)
+        lm_logits = self.lm_head(sequence_output)
+
+        decoder_outputs = (lm_logits,) + decoder_outputs[1:]  # Add hidden states and attention if they are here
+        if lm_labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            decoder_outputs = (loss,) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+        return decoder_outputs + encoder_outputs
--- a/transformers/modeling_tf_albert.py
+++ b/transformers/modeling_tf_albert.py
@@ -587,8 +587,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertModel

-        tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFAlbertModel.from_pretrained('bert-base-uncased')
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
+        model = TFAlbertModel.from_pretrained('albert-base-v1')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -23,22 +23,43 @@ from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
                                 TransfoXLConfig, XLMConfig, XLNetConfig)

 from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
-    TFBertForQuestionAnswering, TFBertForTokenClassification
-from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
-from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
-from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
+    TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
-    TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification
-from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
-from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification
-from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification
-from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
+    TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \
+    TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \
+    TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP

 from .file_utils import add_start_docstrings

 logger = logging.getLogger(__name__)


+TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class TFAutoModel(object):
    r"""
        :class:`~transformers.TFAutoModel` is a generic model class
@@ -51,6 +72,7 @@ class TFAutoModel(object):

        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
@@ -117,6 +139,7 @@ class TFAutoModel(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFTFBertModel (Bert model)
@@ -130,6 +153,7 @@ class TFAutoModel(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

@@ -185,8 +209,12 @@ class TFAutoModel(object):
            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -221,6 +249,7 @@ class TFAutoModelWithLMHead(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
@@ -290,6 +319,7 @@ class TFAutoModelWithLMHead(object):

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
@@ -304,6 +334,7 @@ class TFAutoModelWithLMHead(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

@@ -359,8 +390,12 @@ class TFAutoModelWithLMHead(object):
            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -461,6 +496,7 @@ class TFAutoModelForSequenceClassification(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

@@ -518,6 +554,8 @@ class TFAutoModelForSequenceClassification(object):
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -604,6 +642,7 @@ class TFAutoModelForQuestionAnswering(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.


--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -48,6 +48,12 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
 }


@@ -129,7 +135,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.
-        
+
        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
@@ -148,7 +154,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
            input_shape = shape_list(input_ids)
        else:
            input_shape = shape_list(inputs_embeds)[:-1]
-        
+
        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
@@ -246,7 +252,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
        context_layer = tf.matmul(attention_probs, value_layer)

        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer, 
+        context_layer = tf.reshape(context_layer,
                                  (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)

        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
@@ -591,7 +597,7 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -605,13 +611,13 @@ BERT_INPUTS_DOCSTRING = r"""
            (a) For sequence pairs:

                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                
+
                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``

            (b) For single sequences:

                ``tokens:         [CLS] the dog is hairy . [SEP]``
-                
+
                ``token_type_ids:   0   0   0   0  0     0   0``

            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on

--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        config.num_labels = 1
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')


--- a/transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -538,6 +538,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        config.num_labels = 1
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')


--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
    logger.info("Loading PyTorch weights from {}".format(pt_path))

    pt_state_dict = torch.load(pt_path, map_location='cpu')
+    logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))

    return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)

@@ -134,7 +135,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
        start_prefix_to_remove = tf_model.base_model_prefix + '.'

    symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
-
+    tf_loaded_numel = 0
    weight_value_tuples = []
    all_pytorch_weights = set(list(pt_state_dict.keys()))
    for symbolic_weight in symbolic_weights:
@@ -142,7 +143,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
        name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)

        # Find associated numpy array in pytorch model state dict
-        assert name in pt_state_dict, "{} not found in PyTorch model".format(name)
+        if name not in pt_state_dict:
+            if allow_missing_keys:
+                continue
+            raise AttributeError("{} not found in PyTorch model".format(name))
+
        array = pt_state_dict[name].numpy()

        if transpose:
@@ -159,7 +164,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
            e.args += (symbolic_weight.shape, array.shape)
            raise e

-        logger.info("Initialize TF weight {}".format(symbolic_weight.name))
+        tf_loaded_numel += array.size
+        # logger.warning("Initialize TF weight {}".format(symbolic_weight.name))

        weight_value_tuples.append((symbolic_weight, array))
        all_pytorch_weights.discard(name)
@@ -169,6 +175,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
    if tf_inputs is not None:
        tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run

+    logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
+
    logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))

    return tf_model
@@ -246,6 +254,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F

    all_tf_weights = set(list(tf_weights_map.keys()))
    loaded_pt_weights_data_ptr = {}
+    missing_keys_pt = []
    for pt_weight_name, pt_weight in current_pt_params_dict.items():
        # Handle PyTorch shared weight ()not duplicated in TF 2.0
        if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
@@ -254,7 +263,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F

        # Find associated numpy array in pytorch model state dict
        if pt_weight_name not in tf_weights_map:
-            raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
+            if allow_missing_keys:
+                missing_keys_pt.append(pt_weight_name)
+                continue
+            raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name))

        array, transpose = tf_weights_map[pt_weight_name]

@@ -272,13 +284,14 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
            e.args += (pt_weight.shape, array.shape)
            raise e

-        logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
+        # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))

        new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
        loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
        all_tf_weights.discard(pt_weight_name)

    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
+    missing_keys += missing_keys_pt

    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(

--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
+# coding=utf-8
+# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 T5 model. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+import math
+import copy
+import itertools
+
+import tensorflow as tf
+
+from .configuration_t5 import T5Config
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
+from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
+
+logger = logging.getLogger(__name__)
+
+TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
+}
+
+####################################################
+# TF 2.0 Models are constructed using Keras imperative API by sub-classing
+# - tf.keras.layers.Layer for the layers and
+# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
+####################################################
+
+class TFT5LayerNorm(tf.keras.layers.Layer):
+    def __init__(self, epsilon=1e-6, **kwargs):
+        """ Construct a layernorm module in the T5 style
+            No bias and no substraction of mean.
+        """
+        super(TFT5LayerNorm, self).__init__(**kwargs)
+        self.variance_epsilon = epsilon
+
+    def build(self, input_shape):
+        """Build shared word embedding layer """
+        self.weight = self.add_weight(
+            "weight",
+            shape=(input_shape[-1],),
+            initializer='ones')
+        super(TFT5LayerNorm, self).build(input_shape)
+
+    def call(self, x):
+        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
+        x = x * tf.math.rsqrt(variance + self.variance_epsilon)
+        return self.weight * x
+
+
+class TFT5DenseReluDense(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFT5DenseReluDense, self).__init__(**kwargs)
+        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi')
+        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+        self.act = tf.keras.activations.relu
+
+    def call(self, hidden_states, training=False):
+        h = self.wi(hidden_states)
+        h = self.act(h)
+        h = self.dropout(h, training=training)
+        h = self.wo(h)
+        return h
+
+
+class TFT5LayerFF(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFT5LayerFF, self).__init__(**kwargs)
+        self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, training=False):
+        norm_x = self.layer_norm(hidden_states)
+        y = self.DenseReluDense(norm_x, training=training)
+        layer_output = hidden_states + self.dropout(y, training=training)
+        return layer_output
+
+
+class TFT5Attention(tf.keras.layers.Layer):
+    NEW_ID = itertools.count()
+
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5Attention, self).__init__(**kwargs)
+        self.layer_id = next(TFT5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+
+        self.output_attentions = config.output_attentions
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.d_model = config.d_model
+        self.d_kv = config.d_kv
+        self.n_heads = config.num_heads
+        self.inner_dim = self.n_heads * self.d_kv
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q')
+        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k')
+        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v')
+        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets,
+                                                                     self.n_heads,
+                                                                     name='relative_attention_bias')
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger buckets
+        for larger absolute relative_positions.  All relative positions >=max_distance
+        map to the same bucket.  All relative positions <=-max_distance map to the
+        same bucket.  This should allow for more graceful generalization to longer
+        sequences than the model has been trained on.
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            num_buckets //= 2
+            ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
+            n = tf.math.abs(n)
+        else:
+            n = tf.math.maximum(n, 0)
+        # now n is in the range [0, inf)
+        max_exact = num_buckets // 2
+        is_small = tf.math.less(n, max_exact)
+        val_if_large = max_exact + tf.dtypes.cast(
+            tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
+            / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32)
+        val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
+        ret += tf.where(is_small, n, val_if_large)
+        return ret
+
+    def compute_bias(self, qlen, klen):
+        """ Compute binned relative position bias """
+        context_position = tf.range(qlen)[:, None]
+        memory_position = tf.range(klen)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(relative_position,
+                                                   bidirectional=not self.is_decoder,
+                                                   num_buckets=self.relative_attention_num_buckets)
+        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
+        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
+        return values
+
+    def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = shape_list(input)
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = shape_list(kv)[1]
+
+        def shape(x):
+            """  projection """
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3))
+
+        def unshape(x):
+            """  compute context """
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim))
+
+        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k(k))                                          # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(v))                                          # (bs, n_heads, qlen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = tf.concat([k_, k], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
+        # scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
+        scores = tf.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
+
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                raise ValueError("No position_bias provided and no weights to compute position_bias")
+            position_bias = self.compute_bias(qlen, klen)
+            if mask is not None:
+                position_bias = position_bias + mask
+                # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
+                # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
+
+        scores += position_bias
+        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        context = self.o(context)
+
+        outputs = (context,)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
+        if self.has_relative_attention_bias:
+            outputs = outputs + (position_bias,)
+        return outputs
+
+
+class TFT5LayerSelfAttention(tf.keras.layers.Layer):
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
+        self.SelfAttention = TFT5Attention(config,
+                                           has_relative_attention_bias=has_relative_attention_bias,
+                                           name='SelfAttention')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, attention_mask=None, position_bias=None,
+             head_mask=None, training=False):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(norm_x,
+                                              mask=attention_mask,
+                                              position_bias=position_bias,
+                                              head_mask=head_mask,
+                                              training=training)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y, training=training)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class TFT5LayerCrossAttention(tf.keras.layers.Layer):
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
+        self.EncDecAttention = TFT5Attention(config,
+                                           has_relative_attention_bias=has_relative_attention_bias,
+                                           name='EncDecAttention')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
+             head_mask=None, training=False):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(norm_x,
+                                                mask=attention_mask,
+                                                kv=kv,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask,
+                                                training=training)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y, training=training)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class TFT5Block(tf.keras.layers.Layer):
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5Block, self).__init__(**kwargs)
+        self.is_decoder = config.is_decoder
+        self.layer = []
+        self.layer.append(TFT5LayerSelfAttention(config,
+                                                 has_relative_attention_bias=has_relative_attention_bias,
+                                                 name='layer_._0'))
+        if self.is_decoder:
+            self.layer.append(TFT5LayerCrossAttention(config,
+                                                      has_relative_attention_bias=has_relative_attention_bias,
+                                                      name='layer_._1'))
+            self.layer.append(TFT5LayerFF(config, name='layer_._2'))
+        else:
+            self.layer.append(TFT5LayerFF(config, name='layer_._1'))
+
+    def call(self, hidden_states, attention_mask=None, position_bias=None,
+             encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
+             head_mask=None, training=False):
+        self_attention_outputs = self.layer[0](hidden_states,
+                                                attention_mask=attention_mask,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask,
+                                                training=training)
+        hidden_states = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]
+
+        if not self.is_decoder:
+            hidden_states = self.layer[1](hidden_states, training=training)
+        else:
+            cross_attention_outputs = self.layer[1](hidden_states,
+                                                    kv=encoder_hidden_states,
+                                                    attention_mask=encoder_attention_mask,
+                                                    position_bias=encoder_decoder_position_bias,
+                                                    head_mask=head_mask,
+                                                    training=training)
+            hidden_states = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]
+            hidden_states = self.layer[2](hidden_states, training=training)
+
+        outputs = (hidden_states,) + outputs  # add attentions if we output them
+        return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+
+
+####################################################
+# The full model without a specific pretrained or finetuning head is
+# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
+####################################################
+class TFT5MainLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFT5MainLayer, self).__init__(**kwargs)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.is_decoder = config.is_decoder
+        self.config = config
+        self.num_hidden_layers = config.num_layers
+
+        self.block = [TFT5Block(config,
+                                has_relative_attention_bias=bool(i == 0),
+                                name='block_._{}'.format(i))
+                        for i in range(config.num_layers)]
+        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                              name='final_layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
+
+    def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None,
+             encoder_attention_mask=None, head_mask=None, training=False):
+
+        batch_size, seq_length = shape_list(hidden_states)[:2]
+        if attention_mask is None:
+            attention_mask = tf.fill((batch_size, seq_length), 1)
+        if self.is_decoder and encoder_attention_mask is None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
+        num_dims_attention_mask = len(shape_list(attention_mask))
+        if num_dims_attention_mask == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif num_dims_attention_mask == 2:
+        # Provided a padding mask of dimensions [batch_size, seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if self.config.is_decoder:
+                seq_ids = tf.range(seq_length)
+                causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)),
+                                            seq_ids[None, :, None])
+                causal_mask = tf.cast(causal_mask, dtype=tf.float32)
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
+        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+        # extended_attention_mask = tf.math.equal(extended_attention_mask,
+        #                                         tf.transpose(extended_attention_mask, perm=(-1, -2)))
+
+        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
+
+        if self.is_decoder:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if not head_mask is None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+
+        all_hidden_states = ()
+        all_attentions = ()
+        position_bias = None
+        encoder_decoder_position_bias = None
+        for i, layer_module in enumerate(self.block):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=extended_attention_mask,
+                                         position_bias=position_bias,
+                                         encoder_hidden_states=encoder_hidden_states,
+                                         encoder_attention_mask=encoder_extended_attention_mask,
+                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
+                                         head_mask=head_mask[i],
+                                         training=training)
+            hidden_states = layer_outputs[0]
+            if i == 0:
+                # We share the position biases between the layers - the first layer store them
+                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+                position_bias = layer_outputs[2 if self.output_attentions else 1]
+                if self.is_decoder:
+                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        layer_output = self.dropout(hidden_states, training=training)
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+
+####################################################
+# TFT5PreTrainedModel is a sub-class of tf.keras.Model
+# which take care of loading and saving pretrained weights
+# and various common utilities.
+# Here you just need to specify a few (self-explanatory)
+# pointers for your model.
+####################################################
+class TFT5PreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = T5Config
+    pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "transformer"
+
+    @property
+    def dummy_inputs(self):
+        input_ids = tf.constant(DUMMY_INPUTS)
+        input_mask = tf.constant(DUMMY_MASK)
+        dummy_inputs = {'decoder_input_ids': input_ids,
+                        'encoder_input_ids': input_ids,
+                        'decoder_attention_mask': input_mask}
+        return dummy_inputs
+
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
+
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accepts two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+
+        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+
+        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+
+            T5 is a model with relative position embeddings so you should be able to pad the inputs on
+            the right or the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
+                      "without any specific head on top.",
+                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class TFT5Model(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFT5Model
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = TFT5Model.from_pretrained('t5-small')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids=input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
+                                         name='shared')
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def get_output_embeddings(self):
+        return self.shared
+
+    def call(self, decoder_input_ids, **kwargs):
+        # We allow two types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # The last option is useful to use the tf.keras fit() method.
+
+        if isinstance(decoder_input_ids, dict):
+            kwargs.update(decoder_input_ids)
+        else:
+            kwargs['decoder_input_ids'] = decoder_input_ids
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        return decoder_outputs + encoder_outputs
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class TFT5WithLMHeadModel(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFT5WithLMHeadModel
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = TFT5WithLMHeadModel.from_pretrained('t5-small')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids=input_ids)
+        prediction_scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        self.model_dim = config.d_model
+
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
+                                         name='shared')
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def get_output_embeddings(self):
+        return self.shared
+
+    def call(self, decoder_input_ids, **kwargs):
+        # We allow two types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # The last option is useful to use the tf.keras fit() method.
+
+        if isinstance(decoder_input_ids, dict):
+            kwargs.update(decoder_input_ids)
+        else:
+            kwargs['decoder_input_ids'] = decoder_input_ids
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
+        lm_logits = self.shared(sequence_output, mode="linear")
+        decoder_outputs = (lm_logits,) + decoder_outputs[1:]
+
+        return decoder_outputs + encoder_outputs
--- a/transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

-        self.n_token = config.n_token
+        self.n_token = config.vocab_size

        self.d_embed = config.d_embed
        self.d_model = config.d_model
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        self.d_head = config.d_head
        self.untie_r = config.untie_r

-        self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, 
+        self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, 
                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')

        self.drop = tf.keras.layers.Dropout(config.dropout)
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
            raise NotImplementedError
        # use adaptive softmax (including standard softmax)
        else:
-            self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model, 
+            self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, 
                                              config.cutoffs, div_val=config.div_val, name='crit')

    def reset_length(self, tgt_len, ext_len, mem_len):

--- a/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/transformers/modeling_tf_transfo_xl_utilities.py
@@ -25,15 +25,15 @@ import tensorflow as tf
 from .modeling_tf_utils import shape_list

 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
                 keep_order=False, **kwargs):
        super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)

-        self.n_token = n_token
+        self.vocab_size = vocab_size
        self.d_embed = d_embed
        self.d_proj = d_proj

-        self.cutoffs = cutoffs + [n_token]
+        self.cutoffs = cutoffs + [vocab_size]
        self.cutoff_ends = [0] + self.cutoffs
        self.div_val = div_val

@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                    self.out_projs.append(weight)
                else:
                    self.out_projs.append(None)
-                weight = self.add_weight(shape=(self.n_token, self.d_embed,),
+                weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
                                         initializer='zeros',
                                         trainable=True,
                                         name='out_layers_._{}_._weight'.format(i))
-                bias = self.add_weight(shape=(self.n_token,),
+                bias = self.add_weight(shape=(self.vocab_size,),
                                         initializer='zeros',
                                         trainable=True,
                                         name='out_layers_._{}_._bias'.format(i))
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
        hidden, target = inputs
        head_logprob = 0
        if self.n_clusters == 0:
-            softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
+            softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
            output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
            if target is not None:
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)