Unverified Commit 601d4d69 authored by Thomas Wolf's avatar Thomas Wolf Committed by GitHub
Browse files

[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
parent fd405e9a
......@@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-
text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
question = "What has Huggingface done ?"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]
# default is local attention everywhere
......
......@@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad")
def get_answer(question, context):
input_text = "question: %s context: %s </s>" % (question, context)
features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
features = tokenizer([input_text], return_tensors='pt')
out = model.generate(input_ids=features['input_ids'],
attention_mask=features['attention_mask'])
......
......@@ -255,7 +255,7 @@
"# tokens_pt = torch.tensor([tokens_ids])\n",
"\n",
"# This code can be factored into one-line as follow\n",
"tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
"tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n",
"\n",
"for key, value in tokens_pt2.items():\n",
" print(\"{}:\\n\\t{}\".format(key, value))\n",
......@@ -268,7 +268,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
"As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n",
"that will go through the model. \n",
"\n",
"Moreover, you might have noticed it generated some additional tensors: \n",
......@@ -302,10 +302,10 @@
],
"source": [
"# Single segment input\n",
"single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
"single_seg_input = tokenizer(\"This is a sample input\")\n",
"\n",
"# Multiple segment input\n",
"multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
"multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n",
"\n",
"print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
"print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
......@@ -344,9 +344,9 @@
],
"source": [
"# Padding highlight\n",
"tokens = tokenizer.batch_encode_plus(\n",
"tokens = tokenizer(\n",
" [\"This is a sample\", \"This is another longer sample text\"], \n",
" pad_to_max_length=True # First sentence will have some PADDED tokens to match second sequence length\n",
" padding=True # First sentence will have some PADDED tokens to match second sequence length\n",
")\n",
"\n",
"for i in range(2):\n",
......@@ -405,8 +405,8 @@
],
"source": [
"# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
"input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
"input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
"input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n",
"input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n",
"\n",
"# Let's compare the outputs\n",
"output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
......@@ -464,7 +464,7 @@
"from transformers import DistilBertModel\n",
"\n",
"bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
"input_pt = tokenizer.encode_plus(\n",
"input_pt = tokenizer(\n",
" 'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
" return_tensors=\"pt\"\n",
")\n",
......@@ -514,7 +514,7 @@
"de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
"de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
"\n",
"de_input = de_tokenizer.encode_plus(\n",
"de_input = de_tokenizer(\n",
" \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n",
" return_tensors=\"pt\"\n",
")\n",
......
......@@ -248,7 +248,7 @@
"cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n",
"\n",
"# Inputs are provided through numpy array\n",
"model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n",
"model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n",
"inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n",
"\n",
"# Run the model (None = get all the outputs)\n",
......
......@@ -86,7 +86,7 @@ def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], D
print("Found {} {} with shape: {}".format("input" if is_input else "output", name, axes))
return axes
tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework)
tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
seq_len = tokens.input_ids.shape[-1]
outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
......
......@@ -91,7 +91,7 @@ class LineByLineTextDataset(Dataset):
with open(file_path, encoding="utf-8") as f:
lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
self.examples = batch_encoding["input_ids"]
def __len__(self):
......
......@@ -137,8 +137,11 @@ def _glue_convert_examples_to_features(
labels = [label_from_example(example) for example in examples]
batch_encoding = tokenizer.batch_encode_plus(
[(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True,
batch_encoding = tokenizer(
[(example.text_a, example.text_b) for example in examples],
max_length=max_length,
padding="max_length",
truncation=True,
)
features = []
......
......@@ -120,7 +120,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
spans = []
truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
truncated_query = tokenizer.encode(
example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
)
sequence_added_tokens = (
tokenizer.max_len - tokenizer.max_len_single_sentence + 1
if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
......@@ -131,14 +133,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
span_doc_tokens = all_doc_tokens
while len(spans) * doc_stride < len(all_doc_tokens):
encoded_dict = tokenizer.encode_plus(
encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic
truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
truncation="only_second" if tokenizer.padding_side == "right" else "only_first",
padding="max_length",
max_length=max_seq_length,
return_overflowing_tokens=True,
pad_to_max_length=True,
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
return_token_type_ids=True,
)
......@@ -176,7 +178,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
spans.append(encoded_dict)
if "overflowing_tokens" not in encoded_dict:
if "overflowing_tokens" not in encoded_dict or (
"overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
):
break
span_doc_tokens = encoded_dict["overflowing_tokens"]
......
......@@ -278,7 +278,7 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
......@@ -391,7 +391,7 @@ TF_MULTIPLE_CHOICE_SAMPLE = r"""
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
......
......@@ -402,7 +402,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......
......@@ -579,7 +579,7 @@ BERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
......
......@@ -251,7 +251,7 @@ CTRL_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.CTRLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
......
......@@ -360,7 +360,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.DistilBertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -893,7 +893,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
......
......@@ -186,7 +186,7 @@ ELECTRA_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.ElectraTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......
......@@ -65,7 +65,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......
......@@ -302,7 +302,7 @@ GPT2_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
......
......@@ -454,7 +454,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.LongformerTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
......@@ -970,7 +970,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
>>> encoding = tokenizer(question, text, return_tensors="pt")
>>> input_ids = encoding["input_ids"]
>>> # default is local attention everywhere
......
......@@ -678,7 +678,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.MobileBertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......
......@@ -296,7 +296,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......
......@@ -1487,7 +1487,7 @@ REFORMER_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.ReformerTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment