"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b0cbcdb05b39e6c81db049d2b4d7dfc5d823210d"
Unverified Commit 5164ea91 authored by Lysandre Debut, committed by GitHub

Skipping outputs (#3116)

* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
parent 49debe62
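For context, this change gives every tokenizer a `model_input_names` class attribute listing the optional encodings its model actually consumes; `encode_plus` and related methods then skip the other outputs unless they are requested explicitly. A minimal sketch of the intended behaviour after this commit (checkpoint names are only examples and assume the pretrained files can be downloaded):

```python
from transformers import BertTokenizer, DistilBertTokenizer

# BERT consumes token_type_ids, so they are still returned by default.
bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")
enc = bert_tok.encode_plus("Hello world")
assert "token_type_ids" in enc and "attention_mask" in enc

# DistilBERT declares model_input_names = ["attention_mask"], so
# token_type_ids are skipped unless explicitly requested.
distil_tok = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
enc = distil_tok.encode_plus("Hello world")
assert "token_type_ids" not in enc and "attention_mask" in enc

# An explicit flag still overrides the per-model default.
enc = distil_tok.encode_plus("Hello world", return_token_type_ids=True)
assert "token_type_ids" in enc
```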
@@ -69,6 +69,7 @@ class DistilBertTokenizer(BertTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    model_input_names = ["attention_mask"]
 class DistilBertTokenizerFast(BertTokenizerFast):

@@ -76,3 +77,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    model_input_names = ["attention_mask"]
@@ -119,6 +119,7 @@ class RobertaTokenizer(GPT2Tokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
     def __init__(
         self,

@@ -244,6 +245,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
     def __init__(
         self,
@@ -22,6 +22,7 @@ import os
 import re
 from collections import defaultdict
 from contextlib import contextmanager
+from typing import List, Optional, Tuple, Union
 from tokenizers.implementations import BaseTokenizer

@@ -138,6 +139,7 @@ class PreTrainedTokenizer(object):
     pretrained_vocab_files_map = {}
     pretrained_init_configuration = {}
     max_model_input_sizes = {}
+    model_input_names = ["token_type_ids", "attention_mask"]
     SPECIAL_TOKENS_ATTRIBUTES = [
         "bos_token",

@@ -316,6 +318,7 @@ class PreTrainedTokenizer(object):
         # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
         self.padding_side = kwargs.pop("padding_side", self.padding_side)
+        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
         # Added tokens
         self.added_tokens_encoder = {}
@@ -849,14 +852,14 @@ class PreTrainedTokenizer(object):
     def encode(
         self,
-        text,
-        text_pair=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
+        text: str,
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
         **kwargs
     ):
         """

@@ -865,34 +868,43 @@ class PreTrainedTokenizer(object):
         Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         Args:
-            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+            text (:obj:`str` or :obj:`List[str]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
                 the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method)
-            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+            text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
                 string using the `tokenize` method) or a list of integers (tokenized string ids using the
                 `convert_tokens_to_ids` method)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                If set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+            max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
+                If set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+            stride (:obj:`int`, `optional`, defaults to ``0``):
+                If set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
+            truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
+                String selected in the following options:
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                     starting from the longest one at each token (when there is a pair of input sequences)
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+            pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the
+                model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
+                which can be set to the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
                 Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
-                begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
+            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
+                Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
+                or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
         """
         encoded_inputs = self.encode_plus(
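To make the `encode` options documented in the hunk above concrete, here is an illustrative call with truncation and padding enabled (the checkpoint name is only an example, and the tensor branch assumes PyTorch is installed):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Pad (or truncate) to exactly 16 ids, special tokens included.
ids = tokenizer.encode(
    "The quick brown fox jumps over the lazy dog.",
    add_special_tokens=True,
    max_length=16,
    pad_to_max_length=True,
)
assert len(ids) == 16

# return_tensors="pt" yields a torch.Tensor of shape (1, sequence_length)
# instead of a plain Python list of ids.
tensor_ids = tokenizer.encode("Hello world", return_tensors="pt")
print(tensor_ids.shape)
```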
@@ -911,59 +923,79 @@ class PreTrainedTokenizer(object):
     def encode_plus(
         self,
-        text,
-        text_pair=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        return_offsets_mapping=False,
+        text: str,
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
         **kwargs
     ):
         """
-        Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
+        Returns a dictionary containing the encoded sequence or sequence pair and additional information:
         the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
         Args:
-            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+            text (:obj:`str` or :obj:`List[str]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
                 the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method)
-            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+            text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
                 string using the `tokenize` method) or a list of integers (tokenized string ids using the
                 `convert_tokens_to_ids` method)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                If set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+            max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
+                If set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+            stride (:obj:`int`, `optional`, defaults to ``0``):
+                If set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
+            truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
+                String selected in the following options:
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                     starting from the longest one at each token (when there is a pair of input sequences)
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+            pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the
+                model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
+                which can be set to the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
                 Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
-                begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
-            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
-            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
-            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
-            return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
+            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
+                Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
+                or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
+            return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according
+                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                `What are token type IDs? <../glossary.html#token-type-ids>`_
+            return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return overflowing token information (default False).
+            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return special tokens mask information (default False).
+            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return (char_start, char_end) for each token (default False).
                 If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on
                 Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
             **kwargs: passed to the `self.tokenize()` method

@@ -981,13 +1013,14 @@ class PreTrainedTokenizer(object):
            }
         With the fields:
-        ``input_ids``: list of token ids to be fed to a model
-        ``token_type_ids``: list of token type ids to be fed to a model
-        ``attention_mask``: list of indices specifying which tokens should be attended to by the model
-        ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-        ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
-        ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
-            tokens and 1 specifying sequence tokens.
+        - ``input_ids``: list of token ids to be fed to a model
+        - ``token_type_ids``: list of token type ids to be fed to a model
+        - ``attention_mask``: list of indices specifying which tokens should be attended to by the model
+        - ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+        - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
+        - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+          tokens and 1 specifying sequence tokens.
         """
         def get_input_ids(text):
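As a rough usage sketch of the new `return_token_type_ids` / `return_attention_mask` defaults in `encode_plus` (RoBERTa is one of the tokenizers that now sets `model_input_names = ["attention_mask"]`; the checkpoint name is only an example):

```python
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# token_type_ids are no longer returned by default for RoBERTa,
# while the attention mask still is.
enc = tokenizer.encode_plus("First sequence.", "Second sequence.")
assert "attention_mask" in enc
assert "token_type_ids" not in enc

# Passing the flag explicitly bypasses the model_input_names default.
enc = tokenizer.encode_plus(
    "First sequence.", "Second sequence.", return_token_type_ids=True
)
assert "token_type_ids" in enc
```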
@@ -1038,19 +1071,19 @@ class PreTrainedTokenizer(object):
     def batch_encode_plus(
         self,
-        batch_text_or_text_pairs=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_masks=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_masks=False,
-        return_offsets_mapping=False,
-        return_input_lengths=False,
+        batch_text_or_text_pairs: Union[str, List[str]],
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_masks: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_masks: bool = False,
+        return_offsets_mapping: bool = False,
+        return_input_lengths: bool = False,
         **kwargs
     ):
         """

@@ -1058,32 +1091,59 @@ class PreTrainedTokenizer(object):
         the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
         Args:
-            batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
+            batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`):
+                Batch of sequences or pair of sequences to be encoded.
                 This can be a list of string/string-sequences/int-sequences or a list of pair of
                 string/string-sequences/int-sequence (see details in encode_plus)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                If set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
-                If there are overflowing tokens, those will be added to the returned dictionary`
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+            max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
+                If set to a number, will limit the total sequence returned so that it has a maximum length.
+                If there are overflowing tokens, those will be added to the returned dictionary
+            stride (:obj:`int`, `optional`, defaults to ``0``):
+                If set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
+            truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
+                String selected in the following options:
                 - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                     starting from the longest one at each token (when there is a pair of input sequences)
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+            pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the
+                model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
+                which can be set to the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences
                 Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            return_input_lengths: (optional) If set the resulting dictionary will include the length of each sample
-            return_attention_masks: (optional) Set to True to return the attention mask (default False)
-            return_offsets_mapping: (optional) Not available, should be set to False or it will throw NotImplementError
+            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
+                Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
+                or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
+            return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according
+                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                `What are token type IDs? <../glossary.html#token-type-ids>`_
+            return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`None`):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return overflowing token information (default False).
+            return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return special tokens mask information (default False).
+            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True to return (char_start, char_end) for each token (default False).
+                If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on
+                Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
+            return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If set the resulting dictionary will include the length of each sample
             **kwargs: passed to the `self.tokenize()` method
         Return:

@@ -1099,13 +1159,14 @@ class PreTrainedTokenizer(object):
            }
         With the fields:
-        ``input_ids``: list of token ids to be fed to a model
-        ``token_type_ids``: list of token type ids to be fed to a model
-        ``attention_mask``: list of indices specifying which tokens should be attended to by the model
-        ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-        ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
-        ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
-            tokens and 1 specifying sequence tokens.
+        - ``input_ids``: list of token ids to be fed to a model
+        - ``token_type_ids``: list of token type ids to be fed to a model
+        - ``attention_mask``: list of indices specifying which tokens should be attended to by the model
+        - ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+        - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
+        - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+          tokens and 1 specifying sequence tokens.
         """
         def get_input_ids(text):
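A short example of the two accepted `batch_encode_plus` input formats described above, single sequences and sequence pairs (the checkpoint name is illustrative; output keys follow the same `model_input_names` rules as `encode_plus`):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Batch of single sequences.
single = tokenizer.batch_encode_plus(
    ["A short sequence.", "A slightly longer example sequence."],
    add_special_tokens=True,
)
assert len(single["input_ids"]) == 2

# Batch of sequence pairs, given as a list of (text, text_pair) tuples.
pairs = tokenizer.batch_encode_plus(
    [("Question one?", "Answer one."), ("Question two?", "Answer two.")],
    add_special_tokens=True,
)
assert len(pairs["input_ids"]) == 2
```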
@@ -1220,18 +1281,18 @@ class PreTrainedTokenizer(object):
     def prepare_for_model(
         self,
-        ids,
-        pair_ids=None,
-        max_length=None,
-        add_special_tokens=True,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
+        ids: List[int],
+        pair_ids: Optional[List[int]] = None,
+        max_length: Optional[int] = None,
+        add_special_tokens: bool = True,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
     ):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.

@@ -1292,6 +1353,11 @@ class PreTrainedTokenizer(object):
         len_ids = len(ids)
         len_pair_ids = len(pair_ids) if pair else 0
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
         encoded_inputs = {}
         # Handle max sequence length
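The two `if ... is None` blocks added above are the heart of the feature: an explicitly passed flag always wins, otherwise the tokenizer's `model_input_names` decides. A dependency-free sketch of that resolution logic (the helper name `resolve_return_flags` is made up for illustration):

```python
from typing import List, Optional, Tuple


def resolve_return_flags(
    model_input_names: List[str],
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
) -> Tuple[bool, bool]:
    # None means "use the tokenizer default"; an explicit bool always wins.
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in model_input_names
    return return_token_type_ids, return_attention_mask


# DistilBERT/RoBERTa-style tokenizer: token_type_ids skipped by default...
assert resolve_return_flags(["attention_mask"]) == (False, True)
# ...but an explicit request is honoured.
assert resolve_return_flags(["attention_mask"], return_token_type_ids=True) == (True, True)
```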
@@ -1617,6 +1683,9 @@ class PreTrainedTokenizer(object):
 class PreTrainedTokenizerFast(PreTrainedTokenizer):
+    model_input_names = ["token_type_ids", "attention_mask"]
     def __init__(self, tokenizer: BaseTokenizer, **kwargs):
         if tokenizer is None:
             raise ValueError("Provided tokenizer cannot be None")

@@ -1685,16 +1754,21 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
         if self._tokenizer is not None:
             self._tokenizer.add_special_tokens(self.all_special_tokens)
-    @staticmethod
     def _convert_encoding(
+        self,
         encoding,
         return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
+        return_token_type_ids=None,
+        return_attention_mask=None,
         return_overflowing_tokens=False,
         return_special_tokens_mask=False,
         return_offsets_mapping=False,
     ):
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
         if return_overflowing_tokens and encoding.overflowing is not None:
             encodings = [encoding] + encoding.overflowing
         else:
@@ -1774,18 +1848,18 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
     def batch_encode_plus(
         self,
-        batch_text_or_text_pairs=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        return_offsets_mapping=False,
+        batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        pad_to_max_length: bool = False,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
         **kwargs
     ):
         if not add_special_tokens:

@@ -1868,19 +1942,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
     def encode_plus(
         self,
-        text,
-        text_pair=None,
-        add_special_tokens=False,
-        max_length=None,
-        pad_to_max_length=False,
-        stride=0,
-        truncation_strategy="longest_first",
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        return_offsets_mapping=False,
+        text: str,
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        max_length: Optional[int] = None,
+        pad_to_max_length: bool = False,
+        stride: int = 0,
+        truncation_strategy: str = "longest_first",
+        return_tensors: Optional[bool] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
         **kwargs
     ):
         batched_input = [(text, text_pair)] if text_pair else [text]
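Because `PreTrainedTokenizer.__init__` now pops a `model_input_names` kwarg (see the `@@ -316,6 +318,7 @@` hunk above), the default can also be overridden per instance rather than per call. A hedged sketch, assuming the checkpoint is available and that `from_pretrained` forwards extra kwargs to the constructor as its docstring describes:

```python
from transformers import BertTokenizer

# Drop token_type_ids for this particular tokenizer instance only.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    model_input_names=["attention_mask"],
)
enc = tokenizer.encode_plus("Hello world")
assert "attention_mask" in enc
assert "token_type_ids" not in enc
```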
@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
         # to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
         return [
             {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
-            for i in range(len(batch_encode_plus_sequences))
+            for i in range(len(batch_encode_plus_sequences["input_ids"]))
         ]
     def test_tokenizers_common_properties(self):

@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
     def test_mask_output(self):
         tokenizer = self.get_tokenizer()
-        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
+        if (
+            tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
+            and "token_type_ids" in tokenizer.model_input_names
+        ):
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
             information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)

@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
         encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
         input_ids = encoded_sequence["input_ids"]
-        token_type_ids = encoded_sequence["token_type_ids"]
-        attention_mask = encoded_sequence["attention_mask"]
         special_tokens_mask = encoded_sequence["special_tokens_mask"]
         sequence_length = len(input_ids)
         # Test right padding
         tokenizer.padding_side = "right"
-        padded_sequence = tokenizer.encode_plus(
+        right_padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
             pad_to_max_length=True,
             return_special_tokens_mask=True,
         )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert input_ids + [padding_idx] * padding_size == padded_input_ids
-        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
-        assert attention_mask + [0] * padding_size == padded_attention_mask
-        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
+        right_padded_input_ids = right_padded_sequence["input_ids"]
+        right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
+        right_padded_sequence_length = len(right_padded_input_ids)
+        assert sequence_length + padding_size == right_padded_sequence_length
+        assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
+        assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
         # Test left padding
         tokenizer.padding_side = "left"
-        padded_sequence = tokenizer.encode_plus(
+        left_padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
             pad_to_max_length=True,
             return_special_tokens_mask=True,
         )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert [padding_idx] * padding_size + input_ids == padded_input_ids
-        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
-        assert [0] * padding_size + attention_mask == padded_attention_mask
-        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
+        left_padded_input_ids = left_padded_sequence["input_ids"]
+        left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
+        left_padded_sequence_length = len(left_padded_input_ids)
+        assert sequence_length + padding_size == left_padded_sequence_length
+        assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
+        assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
+        if "token_type_ids" in tokenizer.model_input_names:
+            token_type_ids = encoded_sequence["token_type_ids"]
+            left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
+            right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
+            assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
+            assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
+        if "attention_mask" in tokenizer.model_input_names:
+            attention_mask = encoded_sequence["attention_mask"]
+            right_padded_attention_mask = right_padded_sequence["attention_mask"]
+            left_padded_attention_mask = left_padded_sequence["attention_mask"]
+            assert attention_mask + [0] * padding_size == right_padded_attention_mask
+            assert [0] * padding_size + attention_mask == left_padded_attention_mask
     def test_separate_tokenizers(self):
         # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when