Unverified commit 5164ea91 authored by Lysandre Debut, committed by GitHub

Skipping outputs (#3116)

* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
parent 49debe62
......@@ -69,6 +69,7 @@ class DistilBertTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
class DistilBertTokenizerFast(BertTokenizerFast):
......@@ -76,3 +77,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
......@@ -119,6 +119,7 @@ class RobertaTokenizer(GPT2Tokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......@@ -244,6 +245,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......
......@@ -22,6 +22,7 @@ import os
import re
from collections import defaultdict
from contextlib import contextmanager
from typing import List, Optional, Tuple, Union
from tokenizers.implementations import BaseTokenizer
......@@ -138,6 +139,7 @@ class PreTrainedTokenizer(object):
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
model_input_names = ["token_type_ids", "attention_mask"]
SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
......@@ -316,6 +318,7 @@ class PreTrainedTokenizer(object):
# Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
self.padding_side = kwargs.pop("padding_side", self.padding_side)
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
# Added tokens
self.added_tokens_encoder = {}
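Because __init__ pops model_input_names from the keyword arguments, the class-level default can also be overridden per instance. A hypothetical override (it assumes the keyword is forwarded by from_pretrained to the constructor, and the checkpoint name is only an example):

# Hypothetical sketch, not from the diff: re-enable token_type_ids for a
# tokenizer whose class default lists only attention_mask.
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
    model_input_names=["token_type_ids", "attention_mask"],  # overrides the class attribute
)
encoded = tokenizer.encode_plus("Hello world", "Second sequence")
assert "token_type_ids" in encoded  # produced again because of the override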
......@@ -849,14 +852,14 @@ class PreTrainedTokenizer(object):
def encode(
self,
text,
text_pair=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
**kwargs
):
"""
......@@ -865,34 +868,43 @@ class PreTrainedTokenizer(object):
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args:
text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
text (:obj:`str` or :obj:`List[str]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the
model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
which can be set to the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
**kwargs: passed to the `self.tokenize()` method
"""
encoded_inputs = self.encode_plus(
......@@ -911,59 +923,79 @@ class PreTrainedTokenizer(object):
def encode_plus(
self,
text,
text_pair=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
**kwargs
):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
Returns a dictionary containing the encoded sequence or sequence pair and additional information:
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args:
text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
text (:obj:`str` or :obj:`List[str]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the
model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
which can be set to the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return token type IDs. If left to the default, will return the token type IDs according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are token type IDs? <../glossary.html#token-type-ids>`_
return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return overflowing token information.
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return special tokens mask information.
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return ``(char_start, char_end)`` for each token.
If using Python's tokenizer, this method will raise ``NotImplementedError``. This option is only available
on Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
**kwargs: passed to the `self.tokenize()` method
......@@ -981,13 +1013,14 @@ class PreTrainedTokenizer(object):
}
With the fields:
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``attention_mask``: list of indices specifying which tokens should be attended to by the model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
- ``input_ids``: list of token ids to be fed to a model
- ``token_type_ids``: list of token type ids to be fed to a model
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model
- ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
- ``num_truncated_tokens``: number of overflowing tokens when a ``max_length`` is specified
- ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
"""
def get_input_ids(text):
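An explicit return_token_type_ids / return_attention_mask argument still wins over the model_input_names default. A sketch of the per-call override (not part of the diff), assuming the roberta-base checkpoint is available:

# Sketch only: the explicit keyword overrides the model_input_names default.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

by_default = tokenizer.encode_plus("first sequence", "second sequence")
forced = tokenizer.encode_plus("first sequence", "second sequence", return_token_type_ids=True)

assert "token_type_ids" not in by_default  # RoBERTa declares model_input_names = ["attention_mask"]
assert "token_type_ids" in forced          # explicit request takes precedence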
......@@ -1038,19 +1071,19 @@ class PreTrainedTokenizer(object):
def batch_encode_plus(
self,
batch_text_or_text_pairs=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_masks=True,
return_overflowing_tokens=False,
return_special_tokens_masks=False,
return_offsets_mapping=False,
return_input_lengths=False,
batch_text_or_text_pairs: Union[str, List[str]],
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_masks: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_masks: bool = False,
return_offsets_mapping: bool = False,
return_input_lengths: bool = False,
**kwargs
):
"""
......@@ -1058,32 +1091,59 @@ class PreTrainedTokenizer(object):
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args:
batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`):
Batch of sequences or pair of sequences to be encoded.
This can be a list of string/string-sequences/int-sequences or a list of pair of
string/string-sequences/int-sequence (see details in encode_plus)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary`
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the
model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
which can be set to the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
return_input_lengths: (optional) If set the resulting dictionary will include the length of each sample
return_attention_masks: (optional) Set to True to return the attention mask (default False)
return_offsets_mapping: (optional) Not available, should be set to False or it will throw NotImplementError
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return token type IDs. If left to the default, will return the token type IDs according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are token type IDs? <../glossary.html#token-type-ids>`_
return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return overflowing token information.
return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return special tokens mask information.
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return ``(char_start, char_end)`` for each token.
If using Python's tokenizer, this method will raise ``NotImplementedError``. This option is only available
on Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set, the resulting dictionary will include the length of each sample
**kwargs: passed to the `self.tokenize()` method
Return:
......@@ -1099,13 +1159,14 @@ class PreTrainedTokenizer(object):
}
With the fields:
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``attention_mask``: list of indices specifying which tokens should be attended to by the model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
- ``input_ids``: list of token ids to be fed to a model
- ``token_type_ids``: list of token type ids to be fed to a model
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model
- ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
- ``num_truncated_tokens``: number of overflowing tokens when a ``max_length`` is specified
- ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
"""
def get_input_ids(text):
......@@ -1220,18 +1281,18 @@ class PreTrainedTokenizer(object):
def prepare_for_model(
self,
ids,
pair_ids=None,
max_length=None,
add_special_tokens=True,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
ids: List[int],
pair_ids: Optional[List[int]] = None,
max_length: Optional[int] = None,
add_special_tokens: bool = True,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
):
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
......@@ -1292,6 +1353,11 @@ class PreTrainedTokenizer(object):
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
encoded_inputs = {}
# Handle max sequence length
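The two "is None" checks above carry the whole default-resolution rule of this change. A standalone illustration (the helper name is hypothetical, not library code):

from typing import List, Optional

def resolve_output_flag(explicit: Optional[bool], name: str, model_input_names: List[str]) -> bool:
    # Mirrors the logic above: an explicit True/False always wins; otherwise
    # model_input_names decides whether the output is produced.
    if explicit is not None:
        return explicit
    return name in model_input_names

# DistilBERT/RoBERTa-style default: only attention_mask is declared.
assert resolve_output_flag(None, "token_type_ids", ["attention_mask"]) is False
assert resolve_output_flag(True, "token_type_ids", ["attention_mask"]) is True
assert resolve_output_flag(None, "attention_mask", ["attention_mask"]) is True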
......@@ -1617,6 +1683,9 @@ class PreTrainedTokenizer(object):
class PreTrainedTokenizerFast(PreTrainedTokenizer):
model_input_names = ["token_type_ids", "attention_mask"]
def __init__(self, tokenizer: BaseTokenizer, **kwargs):
if tokenizer is None:
raise ValueError("Provided tokenizer cannot be None")
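The fast (Rust-backed) base class gets the same model_input_names attribute, and _convert_encoding below applies the same resolution, so slow and fast tokenizers skip the same outputs. A sketch (not part of the diff), assuming the distilbert-base-uncased checkpoint:

# Sketch only: the fast DistilBERT tokenizer also skips token_type_ids.
from transformers import DistilBertTokenizerFast

fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
fast_encoded = fast_tokenizer.encode_plus("Hello world", "Second sequence", add_special_tokens=True)

assert "token_type_ids" not in fast_encoded
assert "attention_mask" in fast_encoded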
......@@ -1685,16 +1754,21 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
if self._tokenizer is not None:
self._tokenizer.add_special_tokens(self.all_special_tokens)
@staticmethod
def _convert_encoding(
self,
encoding,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_token_type_ids=None,
return_attention_mask=None,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
):
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_overflowing_tokens and encoding.overflowing is not None:
encodings = [encoding] + encoding.overflowing
else:
......@@ -1774,18 +1848,18 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
def batch_encode_plus(
self,
batch_text_or_text_pairs=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None,
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
**kwargs
):
if not add_special_tokens:
......@@ -1868,19 +1942,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
def encode_plus(
self,
text,
text_pair=None,
add_special_tokens=False,
max_length=None,
pad_to_max_length=False,
stride=0,
truncation_strategy="longest_first",
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = False,
max_length: Optional[int] = None,
pad_to_max_length: bool = False,
stride: int = 0,
truncation_strategy: str = "longest_first",
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
**kwargs
):
batched_input = [(text, text_pair)] if text_pair else [text]
......
......@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
# to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
return [
{value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
for i in range(len(batch_encode_plus_sequences))
for i in range(len(batch_encode_plus_sequences["input_ids"]))
]
def test_tokenizers_common_properties(self):
......@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
def test_mask_output(self):
tokenizer = self.get_tokenizer()
if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
if (
tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
and "token_type_ids" in tokenizer.model_input_names
):
seq_0 = "Test this method."
seq_1 = "With these inputs."
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
......@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
input_ids = encoded_sequence["input_ids"]
token_type_ids = encoded_sequence["token_type_ids"]
attention_mask = encoded_sequence["attention_mask"]
special_tokens_mask = encoded_sequence["special_tokens_mask"]
sequence_length = len(input_ids)
# Test right padding
tokenizer.padding_side = "right"
padded_sequence = tokenizer.encode_plus(
right_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
right_padded_input_ids = right_padded_sequence["input_ids"]
assert sequence_length + padding_size == padded_sequence_length
assert input_ids + [padding_idx] * padding_size == padded_input_ids
assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
assert attention_mask + [0] * padding_size == padded_attention_mask
assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
right_padded_sequence_length = len(right_padded_input_ids)
assert sequence_length + padding_size == right_padded_sequence_length
assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
# Test left padding
tokenizer.padding_side = "left"
padded_sequence = tokenizer.encode_plus(
left_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + input_ids == padded_input_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
assert [0] * padding_size + attention_mask == padded_attention_mask
assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
assert sequence_length + padding_size == left_padded_sequence_length
assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
if "token_type_ids" in tokenizer.model_input_names:
token_type_ids = encoded_sequence["token_type_ids"]
left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
if "attention_mask" in tokenizer.model_input_names:
attention_mask = encoded_sequence["attention_mask"]
right_padded_attention_mask = right_padded_sequence["attention_mask"]
left_padded_attention_mask = left_padded_sequence["attention_mask"]
assert attention_mask + [0] * padding_size == right_padded_attention_mask
assert [0] * padding_size + attention_mask == left_padded_attention_mask
def test_separate_tokenizers(self):
# This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
......