update model doc - switch 3B/11B to 3b/11b

5c00e344 · thomwolf · 110394b2 · 5c00e344 · 5c00e344 · 5c00e344
Commit 5c00e344 authored Dec 13, 2019 by thomwolf
5 changed files
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -217,25 +217,20 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| T5                | ``t5-small``                                               | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-base``                                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-large``                                               | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-3b``                                                  | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-3b``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-11b``                                                 | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-11b``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+



--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -30,8 +30,8 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
 }



--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -44,8 +44,8 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
 }

 ####################################################

--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -34,8 +34,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
 }

 ####################################################

--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -44,8 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
        't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
    }
 }

@@ -56,8 +56,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    't5-small': 512,
    't5-base': 512,
    't5-large': 512,
-    't5-3B': 512,
-    't5-11B': 512,
+    't5-3b': 512,
+    't5-11b': 512,
 }

 class T5Tokenizer(PreTrainedTokenizer):