Unverified Commit 601d4d69 authored by Thomas Wolf, committed by GitHub

[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
parent fd405e9a
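In short, callers move from tokenizer.encode_plus(...) to calling the tokenizer object directly. A minimal sketch of the new call style documented by this commit (the bert-base-cased checkpoint is only an illustrative choice, not part of the diff):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    # Before: the docs pointed readers to encode()/encode_plus()
    # encoding = tokenizer.encode_plus("Hello world", return_tensors="pt")

    # After: the tokenizer is called directly; padding and truncation are explicit flags
    encoding = tokenizer("Hello world", padding=True, truncation=True, return_tensors="pt")
    print(encoding["input_ids"])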
@@ -153,7 +153,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
Indices can be obtained using :class:`transformers.RetriBertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -103,7 +103,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.RobertaTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
@@ -674,7 +674,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional, defaults to :obj:`None`):
@@ -664,7 +664,7 @@ BERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
@@ -882,7 +882,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
- encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf')
+ encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random
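For readers following along, a self-contained sketch of this docstring example under the new call syntax (the bert-base-cased checkpoint is assumed here purely for illustration):

    from transformers import BertTokenizer, TFBertForNextSentencePrediction

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = TFBertForNextSentencePrediction.from_pretrained("bert-base-cased")

    prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    next_sentence = "The sky is blue due to the shorter wavelength of blue light."

    # tokenizer(...) returns the same dict of tensors that encode_plus(...) did
    encoding = tokenizer(prompt, next_sentence, return_tensors="tf")
    logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0]
    assert logits[0][0] < logits[0][1]  # the next sentence was random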
@@ -437,7 +437,7 @@ CTRL_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.CTRLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
@@ -545,7 +545,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -339,7 +339,7 @@ ELECTRA_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.ElectraTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -60,7 +60,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices.
@@ -441,7 +441,7 @@ GPT2_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
@@ -794,7 +794,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.MobileBertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
@@ -405,7 +405,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -156,7 +156,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.RobertaTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -694,7 +694,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
@@ -555,7 +555,7 @@ XLM_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -778,7 +778,7 @@ XLNET_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.XLNetTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -609,7 +609,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
@@ -259,7 +259,7 @@ XLM_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -573,7 +573,7 @@ XLNET_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`transformers.BertTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
@@ -456,17 +456,14 @@ class Pipeline(_ScikitCompat):
"""
return {name: tensor.to(self.device) for name, tensor in inputs.items()}
- def _parse_and_tokenize(self, *args, pad_to_max_length=True, add_special_tokens=True, **kwargs):
+ def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs):
"""
Parse arguments and tokenize
"""
# Parse arguments
inputs = self._args_parser(*args, **kwargs)
- inputs = self.tokenizer.batch_encode_plus(
-     inputs,
-     add_special_tokens=add_special_tokens,
-     return_tensors=self.framework,
-     pad_to_max_length=pad_to_max_length,
+ inputs = self.tokenizer(
+     inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
)
return inputs
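The padding argument that replaces pad_to_max_length accepts more than a boolean. A rough sketch of the common values (checkpoint and sentences are made up for illustration):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    batch = ["a short input", "a somewhat longer input sentence"]

    enc_longest = tokenizer(batch, padding=True)                        # pad to the longest sequence in the batch
    enc_fixed = tokenizer(batch, padding="max_length", max_length=32)   # pad every sequence to max_length
    enc_none = tokenizer(batch, padding=False)                          # no padding at all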
@@ -623,10 +620,10 @@ class TextGenerationPipeline(Pipeline):
with self.device_placement():
if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]:
inputs = self._parse_and_tokenize(
- self.PADDING_TEXT + prompt_text, pad_to_max_length=False, add_special_tokens=False
+ self.PADDING_TEXT + prompt_text, padding=False, add_special_tokens=False
)
else:
- inputs = self._parse_and_tokenize(prompt_text, pad_to_max_length=False, add_special_tokens=False)
+ inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False)
# set input_ids to None to allow empty prompt
if inputs["input_ids"].shape[-1] == 0:
@@ -920,11 +917,8 @@ class TokenClassificationPipeline(Pipeline):
# Manage correct placement of the tensors
with self.device_placement():
- tokens = self.tokenizer.encode_plus(
-     sentence,
-     return_attention_mask=False,
-     return_tensors=self.framework,
-     max_length=self.tokenizer.max_len,
+ tokens = self.tokenizer(
+     sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
)
# Forward
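Here truncation=True replaces the old pattern of passing max_length=tokenizer.max_len; the input is cut to the model's maximum length instead of overflowing it. A hedged sketch (checkpoint and text are illustrative):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    long_text = "word " * 1000

    # Truncated to the model's maximum input length
    tokens = tokenizer(long_text, return_attention_mask=False, return_tensors="pt", truncation=True)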
@@ -1187,12 +1181,12 @@ class QuestionAnsweringPipeline(Pipeline):
examples = self._args_parser(*args, **kwargs)
features_list = [
squad_convert_examples_to_features(
- [example],
- self.tokenizer,
- kwargs["max_seq_len"],
- kwargs["doc_stride"],
- kwargs["max_question_len"],
- False,
+ examples=[example],
+ tokenizer=self.tokenizer,
+ max_seq_length=kwargs["max_seq_len"],
+ doc_stride=kwargs["doc_stride"],
+ max_query_length=kwargs["max_question_len"],
+ is_training=False,
tqdm_enabled=False,
)
for example in examples
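To make the keyword-argument call above concrete, a sketch with a made-up example (all values here are illustrative, not taken from the commit):

    from transformers import AutoTokenizer, SquadExample, squad_convert_examples_to_features

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    example = SquadExample(
        qas_id="0",
        question_text="Where does Wolfgang live?",
        context_text="My name is Wolfgang and I live in Berlin.",
        answer_text=None,
        start_position_character=None,
        title="",
    )

    features = squad_convert_examples_to_features(
        examples=[example],
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        tqdm_enabled=False,
    )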
@@ -1431,11 +1425,11 @@ class SummarizationPipeline(Pipeline):
), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
documents = ([prefix + document for document in documents[0]],)
- pad_to_max_length = True
+ padding = True
elif isinstance(documents[0], str):
documents = (prefix + documents[0],)
- pad_to_max_length = False
+ padding = False
else:
raise ValueError(
" `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format(
@@ -1444,7 +1438,7 @@ class SummarizationPipeline(Pipeline):
)
with self.device_placement():
- inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length)
+ inputs = self._parse_and_tokenize(*documents, padding=padding)
if self.framework == "pt":
inputs = self.ensure_tensor_on_device(**inputs)
@@ -1549,11 +1543,11 @@ class TranslationPipeline(Pipeline):
self.tokenizer.pad_token_id is not None
), "Please make sure that the tokenizer has a pad_token_id when using a batch input"
args = ([prefix + text for text in args[0]],)
- pad_to_max_length = True
+ padding = True
elif isinstance(args[0], str):
args = (prefix + args[0],)
- pad_to_max_length = False
+ padding = False
else:
raise ValueError(
" `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format(
@@ -1562,7 +1556,7 @@ class TranslationPipeline(Pipeline):
)
with self.device_placement():
- inputs = self._parse_and_tokenize(*args, pad_to_max_length=pad_to_max_length)
+ inputs = self._parse_and_tokenize(*args, padding=padding)
if self.framework == "pt":
inputs = self.ensure_tensor_on_device(**inputs)
@@ -263,7 +263,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+ special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
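A small sketch of when this mask comes into play (checkpoint and sentence are illustrative only):

    from transformers import AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    ids = tokenizer.encode("hello world", add_special_tokens=False)

    # 1 marks the positions that prepare_for_model will fill with special tokens ([CLS]/[SEP] for ALBERT)
    mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=False)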