Unverified Commit 601d4d69 authored by Thomas Wolf, committed by GitHub

[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
parent fd405e9a
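
Every hunk below applies the same migration: `encode_plus` and `batch_encode_plus` become a direct call to the tokenizer, and `pad_to_max_length=True` becomes the `padding` argument. A minimal before/after sketch, assuming a `bert-base-cased` checkpoint (any pretrained tokenizer behaves the same):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint

    # Old API, deprecated by this commit:
    #   tokenizer.encode_plus("one text", return_tensors="pt")
    #   tokenizer.batch_encode_plus(["text a", "text b"], pad_to_max_length=True)

    # New API: call the tokenizer directly; a list input is treated as a batch.
    single = tokenizer("one text", return_tensors="pt")
    batch = tokenizer(["text a", "text b"], padding=True, return_tensors="pt")
    print(single["input_ids"].shape)  # (1, sequence_length)
    print(batch["input_ids"].shape)   # (2, padded_length)
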
@@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-
 text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
 question = "What has Huggingface done ?"
-encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+encoding = tokenizer(question, text, return_tensors="pt")
 input_ids = encoding["input_ids"]
 # default is local attention everywhere
...
@@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad")
 def get_answer(question, context):
     input_text = "question: %s context: %s </s>" % (question, context)
-    features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
+    features = tokenizer([input_text], return_tensors='pt')
     out = model.generate(input_ids=features['input_ids'],
                          attention_mask=features['attention_mask'])
...
@@ -255,7 +255,7 @@
 "# tokens_pt = torch.tensor([tokens_ids])\n",
 "\n",
 "# This code can be factored into one-line as follow\n",
-"tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
+"tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n",
 "\n",
 "for key, value in tokens_pt2.items():\n",
 "    print(\"{}:\\n\\t{}\".format(key, value))\n",
@@ -268,7 +268,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
+"As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n",
 "that will go through the model. \n",
 "\n",
 "Moreover, you might have noticed it generated some additional tensors: \n",
@@ -302,10 +302,10 @@
 ],
 "source": [
 "# Single segment input\n",
-"single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
+"single_seg_input = tokenizer(\"This is a sample input\")\n",
 "\n",
 "# Multiple segment input\n",
-"multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
+"multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n",
 "\n",
 "print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
 "print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
@@ -344,9 +344,9 @@
 ],
 "source": [
 "# Padding highlight\n",
-"tokens = tokenizer.batch_encode_plus(\n",
+"tokens = tokenizer(\n",
 "    [\"This is a sample\", \"This is another longer sample text\"], \n",
-"    pad_to_max_length=True  # First sentence will have some PADDED tokens to match second sequence length\n",
+"    padding=True  # First sentence will have some PADDED tokens to match second sequence length\n",
 ")\n",
 "\n",
 "for i in range(2):\n",
@@ -405,8 +405,8 @@
 ],
 "source": [
 "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
-"input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
-"input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
+"input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n",
+"input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n",
 "\n",
 "# Let's compare the outputs\n",
 "output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
@@ -464,7 +464,7 @@
 "from transformers import DistilBertModel\n",
 "\n",
 "bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
-"input_pt = tokenizer.encode_plus(\n",
+"input_pt = tokenizer(\n",
 "    'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
 "    return_tensors=\"pt\"\n",
 ")\n",
@@ -514,7 +514,7 @@
 "de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
 "de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
 "\n",
-"de_input = de_tokenizer.encode_plus(\n",
+"de_input = de_tokenizer(\n",
 "    \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n",
 "    return_tensors=\"pt\"\n",
 ")\n",
@@ -559,4 +559,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}
...
@@ -248,7 +248,7 @@
 "cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n",
 "\n",
 "# Inputs are provided through numpy array\n",
-"model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n",
+"model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n",
 "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n",
 "\n",
 "# Run the model (None = get all the outputs)\n",
...
@@ -86,7 +86,7 @@ def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], D
         print("Found {} {} with shape: {}".format("input" if is_input else "output", name, axes))
         return axes

-    tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework)
+    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
     seq_len = tokens.input_ids.shape[-1]
     outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
...
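`infer_shapes` can use `tokens.input_ids` and `nlp.model(**tokens)` because the call returns a `BatchEncoding`, which supports both attribute and dict-style access and unpacks like a dict. A sketch assuming `bert-base-cased` in place of the pipeline's model:

    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    model = AutoModel.from_pretrained("bert-base-cased")

    tokens = tokenizer("This is a sample output", return_tensors="pt")
    seq_len = tokens.input_ids.shape[-1]  # attribute access, as in infer_shapes
    outputs = model(**tokens)             # BatchEncoding unpacks like a dict
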
@@ -91,7 +91,7 @@ class LineByLineTextDataset(Dataset):
         with open(file_path, encoding="utf-8") as f:
             lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

-        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
+        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
         self.examples = batch_encoding["input_ids"]

     def __len__(self):
...
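In the new API truncation is opt-in, hence the added `truncation=True`: passing `max_length` alone now emits a deprecation warning. A sketch of the dataset's call with toy lines and a small block size (both assumed values):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    lines = ["a short line", "a much longer line " * 50]
    block_size = 16  # toy value

    batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
    print([len(ids) for ids in batch_encoding["input_ids"]])  # every example capped at 16
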
@@ -137,8 +137,11 @@ def _glue_convert_examples_to_features(
     labels = [label_from_example(example) for example in examples]

-    batch_encoding = tokenizer.batch_encode_plus(
-        [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True,
+    batch_encoding = tokenizer(
+        [(example.text_a, example.text_b) for example in examples],
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
     )

     features = []
...
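GLUE keeps fixed-size features, so `pad_to_max_length=True` maps to `padding="max_length"` rather than `padding=True`. A sketch of the difference between the two modes, assuming `bert-base-cased` and two toy sentence pairs:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    pairs = [("first sentence", "its pair"), ("second", "pair")]

    dynamic = tokenizer(pairs, padding=True, truncation=True, max_length=128)
    fixed = tokenizer(pairs, padding="max_length", truncation=True, max_length=128)
    print(len(dynamic["input_ids"][0]))  # length of the longest pair in the batch
    print(len(fixed["input_ids"][0]))    # always 128
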
@@ -120,7 +120,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
     spans = []

-    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
+    truncated_query = tokenizer.encode(
+        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
+    )
     sequence_added_tokens = (
         tokenizer.max_len - tokenizer.max_len_single_sentence + 1
         if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
@@ -131,14 +133,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
     span_doc_tokens = all_doc_tokens
     while len(spans) * doc_stride < len(all_doc_tokens):

-        encoded_dict = tokenizer.encode_plus(
+        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
             truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
             span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
+            truncation="only_second" if tokenizer.padding_side == "right" else "only_first",
+            padding="max_length",
             max_length=max_seq_length,
             return_overflowing_tokens=True,
-            pad_to_max_length=True,
             stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-            truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
             return_token_type_ids=True,
         )
@@ -176,7 +178,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
         spans.append(encoded_dict)

-        if "overflowing_tokens" not in encoded_dict:
+        if "overflowing_tokens" not in encoded_dict or (
+            "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
+        ):
             break
         span_doc_tokens = encoded_dict["overflowing_tokens"]
...
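The windowing loop above depends on `return_overflowing_tokens`: with the slow tokenizer, the tokens that did not fit come back under the `overflowing_tokens` key and are fed into the next span. A sketch with a toy question and context (checkpoint name and lengths are assumptions):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    question = "Who was Jim Henson?"
    context = "Jim Henson was a nice puppet. " * 40  # long enough to overflow

    encoded_dict = tokenizer.encode_plus(
        question,
        context,
        truncation="only_second",  # only the context side may be truncated
        padding="max_length",
        max_length=64,
        stride=16,  # tokens of overlap carried into the overflow
        return_overflowing_tokens=True,
    )
    print(len(encoded_dict["input_ids"]))           # 64, the padded span
    print(len(encoded_dict["overflowing_tokens"]))  # what is left for the next span
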
@@ -278,7 +278,7 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
        >>> choice1 = "It is eaten while held in the hand."
        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

-       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
+       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
        >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels)  # batch size is 1

        >>> # the linear classifier still needs to be trained
@@ -391,7 +391,7 @@ TF_MULTIPLE_CHOICE_SAMPLE = r"""
        >>> choice0 = "It is eaten with a fork and a knife."
        >>> choice1 = "It is eaten while held in the hand."

-       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
+       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
        >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
        >>> outputs = model(inputs)  # batch size is 1
...
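These samples batch prompt/choice pairs through one call; with `padding=True` the pairs are padded together and then unsqueezed into a (batch, num_choices, seq_len) block. A PyTorch sketch, assuming `bert-base-cased` (the samples' own checkpoints behave identically):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    prompt = "In Italy, pizza served in formal settings is presented unsliced."
    choice0 = "It is eaten with a fork and a knife."
    choice1 = "It is eaten while held in the hand."

    # Each inner list is one (text, text_pair) example in the batch.
    encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True)
    inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # add the batch dimension
    print(inputs["input_ids"].shape)  # torch.Size([1, 2, seq_len])
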
@@ -402,7 +402,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.AlbertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -579,7 +579,7 @@ BERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.BertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
...
@@ -251,7 +251,7 @@ CTRL_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.CTRLTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
...
@@ -360,7 +360,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.DistilBertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -893,7 +893,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
        >>> choice1 = "It is eaten while held in the hand."
        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

-       >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
+       >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
        >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)  # batch size is 1

        >>> # the linear classifier still needs to be trained
...
@@ -186,7 +186,7 @@ ELECTRA_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.ElectraTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -65,7 +65,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.BertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -302,7 +302,7 @@ GPT2_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
...
@@ -454,7 +454,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.LongformerTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
...
@@ -970,7 +970,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
        >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-       >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+       >>> encoding = tokenizer(question, text, return_tensors="pt")
        >>> input_ids = encoding["input_ids"]

        >>> # default is local attention everywhere
...
@@ -678,7 +678,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.MobileBertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -296,7 +296,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -1487,7 +1487,7 @@ REFORMER_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.ReformerTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...