"vscode:/vscode.git/clone" did not exist on "2bae14d20daff8d1a72846e0b9b1988d6741bb65"
Unverified Commit 4ef0abb7 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add TAPEX (#16473)



* Add TapexTokenizer

* Improve docstrings and provide option to provide answer

* Remove option for pretokenized inputs

* Add TAPEX to README

* Fix copies

* Remove option for pretokenized inputs

* Initial commit: add tapex fine-tuning examples on both table-based question answering and table-based fact verification.

* - Draft a README file for running the script and introducing some background.
- Remove unused code lines in tabfact script.
- Disable the deafult `pad_to_max_length` option which is memory-consuming.

* * Support `as_target_tokenizer` function for TapexTokenizer.
* Fix the do_lower_case behaviour of TapexTokenizer.
* Add unit tests for target scenarios and cased/uncased scenarios for both source and target.

* * Replace the label BartTokenizer with TapexTokenizer's as_target_tokenizer function.
* Fix typos in tapex example README.

* * fix the evaluation script - remove the property `task_name`

* * Make the label space more clear for tabfact tasks

* * Using a new fine-tuning script for tapex-base on tabfact.

* * Remove the lowercase code outside the tokenizer - we use the tokenizer to control whether do_lower_case
* Guarantee the hyper-parameter can be run without out-of-memory on 16GB card and report the new reproduced number on wikisql

* * Remove the default tokenizer_name option.
* Provide evaluation command.

* * Support for WikiTableQuestion dataset.

* Fix a typo in README.

* * Fix the datasets's key name in WikiTableQuestions

* Run make fixup and move test to folder

* Fix quality

* Apply suggestions from code review

* Apply suggestions from code review
Co-authored-by: default avatarSuraj Patil <surajp815@gmail.com>

* Apply suggestions from code review

* Apply suggestions from code review
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply some more suggestions from code review

* Improve docstrings

* Overwrite failing test

* Improve comment in example scripts

* Fix rebase

* Add TAPEX to Auto mapping

* Add TAPEX to auto config mappings

* Put TAPEX higher than BART in auto mapping

* Add TAPEX to doc tests
Co-authored-by: default avatarNiels Rogge <nielsrogge@Nielss-MBP.localdomain>
Co-authored-by: default avatarSivilTaram <qianlxc@outlook.com>
Co-authored-by: default avatarNiels Rogge <nielsrogge@nielss-mbp.home>
Co-authored-by: default avatarSuraj Patil <surajp815@gmail.com>
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: default avatarNiels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent 33cb2115
# coding=utf-8
# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for TAPEX."""
import json
import os
import random
from contextlib import contextmanager
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union
import regex as re
from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
from ...utils import logging
if is_pandas_available():
import pandas as pd
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/vocab.json",
},
"merges_file": {
"microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/tapex-base": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/tapex-base": {"do_lower_case": True},
}
class TapexTruncationStrategy(ExplicitEnum):
"""
Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for tab-completion in an IDE.
"""
DROP_ROWS_TO_FIT = "drop_rows_to_fit"
class TokenizerStrategy(ExplicitEnum):
TOKENIZE_SOURCE = "tokenize_source"
TOKENIZE_TARGET = "tokenize_target"
TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
add_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to encode the sequences with the special tokens relative to their model.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`],
*optional*, defaults to `False`):
Activates and controls truncation. Accepts the following values:
- `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will truncate
row by row, removing rows from the table.
- `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will
truncate token by token, removing a token from the longest sequence in the pair if a pair of
sequences (or a batch of pairs) is provided.
- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
`None`, this will use the predefined model maximum length if a maximum length is required by one of the
truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
truncation/padding to a maximum length will be deactivated.
stride (`int`, *optional*, defaults to 0):
If set to a number along with `max_length`, the overflowing tokens returned when
`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
returned to provide some overlap between truncated and overflowing sequences. The value of this
argument defines the number of overlapping tokens.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
"""
@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large #
of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset
you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe
vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class IndexedRowTableLinearize:
"""
FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
"""
def process_table(self, table_content: Dict):
"""
Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE
# process header
table_str = self.process_header(table_content["header"]) + " "
# process rows
for i, row_example in enumerate(table_content["rows"]):
# NOTE: the row should start from row 1 instead of 0
table_str += self.process_row(row_example, row_index=i + 1) + " "
return table_str.strip()
def process_header(self, headers: List):
"""
Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
return "col : " + " | ".join(headers)
def process_row(self, row: List, row_index: int):
"""
Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
row_str = ""
row_cell_values = []
for cell_value in row:
if isinstance(cell_value, int):
row_cell_values.append(str(cell_value))
else:
row_cell_values.append(cell_value)
row_str += " | ".join(row_cell_values)
return "row " + str(row_index) + " : " + row_str
class TapexTokenizer(PreTrainedTokenizer):
r"""
Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences
to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following:
sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table
will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated
for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to
the tokenizer for instance to prepare them for the model.
Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (BART tokenizer detect beginning of words by the preceding space).
max_cell_length (`int`, *optional*, defaults to 15):
Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation
takes place.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
do_lower_case=True,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
max_cell_length=15,
**kwargs
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
do_lower_case=do_lower_case,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
max_cell_length=max_cell_length,
**kwargs,
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.do_lower_case = do_lower_case
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# additional properties
self.max_cell_length = max_cell_length
self.table_linearize = IndexedRowTableLinearize()
# property to decide using which call function
self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A TAPEX sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Args:
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Args:
Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not:
make use of token type ids, therefore a list of zeros is returned.
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def __call__(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]] = None,
query: Optional[Union[TextInput, List[TextInput]]] = None,
answer: Union[str, List[str]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several table-sequence pair(s).
Args:
table (`pd.DataFrame`, `List[pd.DataFrame]`):
Table(s) containing tabular data.
query (`str` or `List[str]`, *optional*):
Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of
sentences must match the number of tables.
answer (`str` or `List[str]`, *optional*):
Optionally, the corresponding answer to the questions as supervision.
"""
if self.current_tokenizer == TokenizerStrategy.TOKENIZE_SOURCE:
if table is None:
raise ValueError("Please ensure that the table is not empty if you use TAPEX to encode source.")
return self.source_call_func(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
if answer is None:
raise ValueError("Please ensure that the answer is not empty if you use TAPEX to encode target.")
return self.target_call_func(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def source_call_func(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[Union[TextInput, List[TextInput]]] = None,
answer: Union[str, List[str]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
# Input type checking for clearer error
valid_table = False
valid_query = False
# Check that table have a valid type
if isinstance(table, pd.DataFrame):
valid_table = True
elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame):
valid_table = True
# Check that query have a valid type
if query is None or isinstance(query, str):
valid_query = True
elif isinstance(query, (list, tuple)):
if len(query) == 0 or isinstance(query[0], str):
valid_query = True
if not valid_table:
raise ValueError(
"table input must of type `pd.DataFrame` (single example), `List[pd.DataFrame]` (batch of examples). "
)
if not valid_query:
raise ValueError("query input must of type `str` (single example), `List[str]` (batch of examples). ")
is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple))
if is_batched:
return self.batch_encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
return self.encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[List[TextInput]] = None,
answer: List[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._batch_encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _batch_encode_plus(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[List[TextInput]] = None,
answer: Optional[List[str]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)):
# single table, many queries case
# duplicate table for every query
table = [table] * len(query)
if isinstance(table, (list, tuple)) and isinstance(query, str):
# many tables, single query case
# duplicate query for every table
query = [query] * len(table)
batch_outputs = self._batch_prepare_for_model(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
return BatchEncoding(batch_outputs)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[Union[TextInput, List[TextInput]]] = None,
answer: Optional[Union[str, List[str]]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
"""
This method adds special tokens, truncates sequences if overflowing while taking into account the special
tokens and manages a moving window (with user defined stride) for overflowing tokens.
"""
batch_outputs = {}
if answer is None:
answer = [None] * len(table)
for _table, _query, _answer in zip(table, query, answer):
text = self.prepare_table_query(
_table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length
)
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
outputs = self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterwards
return_attention_mask=False, # we pad in batch afterwards
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # We convert the whole batch to tensors at the end
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
def encode(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = False,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> List[int]:
"""
Prepare a table, a string and possible answer for the model. This method does not return token type IDs,
attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build
your processing on your own, otherwise refer to `__call__`.
"""
encoded_inputs = self.encode_plus(
table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
return_tensors=return_tensors,
**kwargs,
)
return encoded_inputs["input_ids"]
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _encode_plus(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
text = self.prepare_table_query(
table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length
)
# if necessary, perform lower case
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
return self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
def target_call_func(
self,
answer: Union[str, List[str]],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
The method tokenizes and prepares the answer label for the model.
Args:
answer (`str` or `List[str]`):
Corresponding answer supervision to the queries for training the model.
"""
is_batched = isinstance(answer, (list, tuple))
if is_batched:
return self.target_batch_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
return self.target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def target_batch_encode_plus(
self,
answer: List[str],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Prepare answer strings for the model.
Args:
answer `List[str]`:
Corresponding answer supervision to the queries for training the model.
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._target_batch_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _target_batch_encode_plus(
self,
answer: List[str],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
batch_outputs = {}
for text in answer:
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
outputs = self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterwards
return_attention_mask=False, # we pad in batch afterwards
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # We convert the whole batch to tensors at the end
prepend_batch_axis=False,
verbose=verbose,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return BatchEncoding(batch_outputs)
def target_encode(
self,
answer: str,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = False,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> List[int]:
"""
Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc.
which are necessary for the model to work correctly. Use this method if you want to build your processing on
your own, otherwise refer to `__call__`.
Args:
answer `str`:
Corresponding answer supervision to the queries for training the model
"""
encoded_outputs = self.target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
return_tensors=return_tensors,
**kwargs,
)
return encoded_outputs["input_ids"]
def target_encode_plus(
self,
answer: str,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Prepare a answer string for the model.
Args:
answer `str`:
Corresponding answer supervision to the queries for training the model.
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
return self._target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _target_encode_plus(
self,
answer: str,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
text = answer
# if necessary, perform lower case
if self.do_lower_case:
text = text.lower()
tokens = self.tokenize(text)
return self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
@contextmanager
def as_target_tokenizer(self):
"""
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
sequence-to-sequence models that need a slightly different processing for the labels.
"""
self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET
yield
# restore the call function
self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
def prepare_table_query(
self,
table,
query,
answer=None,
truncation_strategy=Union[str, TruncationStrategy, TapexTruncationStrategy],
max_length=None,
):
"""
This method can be used to linearize a table and add a corresponding query.
Optionally, it also handles truncation of the table (cells).
An answer can be provided for more precise truncation.
"""
if not table.empty:
# step 1: create table dictionary
table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]}
# step 2: modify table internally
# always truncate table cells based on self.max_cell_length
# optionally truncate rows if truncation_strategy is set to it
self.truncate_table_cells(table_content, query, answer)
if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT:
self.truncate_table_rows(table_content, query, answer, max_length=max_length)
# step 3: linearize table
linear_table = self.table_linearize.process_table(table_content)
else:
linear_table = ""
if linear_table == "":
logger.warning(
"You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). "
+ f"Please carefully check the corresponding table with the query : {query}."
)
if query == "":
logger.warning("You provide nothing to query with respect to the table.")
# step 4: concatenate query with linear_table
separator = " " if query and linear_table else ""
joint_input = (query + separator + linear_table) if query else linear_table
return joint_input
def truncate_table_cells(self, table_content: Dict, question: str, answer: List):
# TODO (Qian): is it possible to revert the original cell if it is in the final answer?
cell_mapping = {}
for row in table_content["rows"]:
for i, cell in enumerate(row):
truncate_cell = self.truncate_cell(cell)
if truncate_cell is not None:
cell_mapping[cell] = truncate_cell
row[i] = truncate_cell
# modify the answer list
if answer is not None:
for i, case in enumerate(answer):
if case in cell_mapping.keys():
answer[i] = cell_mapping[case]
def truncate_cell(self, cell_value):
# do not process on these cases
if isinstance(cell_value, int) or isinstance(cell_value, float):
return cell_value
if cell_value.strip() != "":
try_tokens = self.tokenize(cell_value)
if len(try_tokens) >= self.max_cell_length:
retain_tokens = try_tokens[: self.max_cell_length]
retain_cell_value = self.convert_tokens_to_string(retain_tokens)
return retain_cell_value
else:
return None
else:
return cell_value
def truncate_table_rows(
self, table_content: Dict, question: str, answer: Optional[Union[str, List[str]]] = None, max_length=None
):
"""
Args:
table_content:
{"header": xxx, "rows": xxx, "id" (Optionally): xxx}
question:
natural language sentence
answer:
if for training, is the supervision; otherwise will be empty
"""
delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
# randomly delete unrelated rows
self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
# guarantee the result < max_length
maximum_keep_rows = 0
for ind, row_example in enumerate(table_content["rows"]):
value_string = self.table_linearize.process_row(row_example, ind + 1)
value_token_len = len(self.tokenize(value_string))
# over the size limit, and take action
if value_token_len > remain_token_len:
break
remain_token_len -= value_token_len
maximum_keep_rows += 1
del table_content["rows"][maximum_keep_rows:]
def estimate_delete_ratio(self, table_content: Dict, question: str, max_length=None):
if "header" not in table_content or "rows" not in table_content:
raise ValueError("The table content should contain both 'header' and 'rows' keys.")
# calculate the tokens of header, special tokens will only be pre-prepended into question
question_tokens = self.tokenize(question, add_special_tokens=True)
# calculate the tokens of header
header_string = self.table_linearize.process_header(table_content["header"])
header_tokens = self.tokenize(header_string, add_special_tokens=False)
# split all cell values into tokens and see how many can be accommodated
used_token_len = len(question_tokens) + len(header_tokens)
# remaining token space for rows
remain_token_len = max_length - used_token_len
value_string = ""
for _, row_example in enumerate(table_content["rows"]):
# use a general index to roughly estimate the overall token len
value_string += self.table_linearize.process_row(row_example, 100) + " "
value_token_len = len(self.tokenize(value_string))
if value_token_len < remain_token_len:
# no row will be deleted
return 0.0, remain_token_len
else:
# calc a roughly delete rate
return 1.0 - remain_token_len / value_token_len, remain_token_len
def delete_unrelated_rows(self, table_content: Dict, question: str, answer: List, delete_ratio: float):
"""
The argument answer is used only during training.
"""
truncated_unrelated_indices = []
related_indices = []
if answer is None or len(answer) == 0:
answer_set = set([])
else:
answer_set = set([ans_ex.lower() for ans_ex in answer])
# add question key words into answer set
if question is not None:
answer_set.update(question.split())
question_set = set(question.strip("?!.,").split(" "))
row_max_len = len(table_content["rows"])
for _row_idx, row in enumerate(table_content["rows"]):
lower_row = set([str(cell).lower() for cell in row])
if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0:
truncated_unrelated_indices.append(_row_idx)
else:
# add neighbours to preserve information aggressively
related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2])
# remove the neighbours
truncated_unrelated_indices = [
_row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices
]
# select some cases to drop
drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio))
drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items)
for _row_idx in reversed(range(row_max_len)):
if _row_idx in drop_row_indices:
del table_content["rows"][_row_idx]
# only when the drop ratio is too large, logging for warning.
if "id" in table_content and len(drop_row_indices) > 0:
logger.warning("Delete {:.2f} rows in table {}".format(len(drop_row_indices), table_content["id"]))
# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from typing import List
import pandas as pd
from transformers import AddedToken, TapexTokenizer
from transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES
from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, slow
from ..test_tokenization_common import TokenizerTesterMixin
@require_pandas
class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = TapexTokenizer
test_rust_tokenizer = False
from_pretrained_kwargs = {"cls_token": "<s>"}
test_seq2seq = False
def setUp(self):
super().setUp()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
# fmt: off
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"] # noqa: E231
# fmt: on
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_table(self, tokenizer, length=5):
toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
if length == 0:
data = {}
else:
data = {toks[0]: [toks[tok] for tok in range(1, length)]}
table = pd.DataFrame.from_dict(data)
return table
def get_table_and_query(self, tokenizer, length=5):
toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
table = self.get_table(tokenizer, length=length - 3)
query = " ".join(toks[:3])
return table, query
def get_clean_sequence(
self,
tokenizer,
with_prefix_space=False,
max_length=20,
min_length=5,
empty_table: bool = False,
add_special_tokens: bool = True,
return_table_and_query: bool = False,
):
toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
if empty_table:
table = pd.DataFrame.from_dict({})
query = " ".join(toks[:min_length])
else:
data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]}
table = pd.DataFrame.from_dict(data)
query = " ".join(toks[:3])
output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens)
output_txt = tokenizer.decode(output_ids)
if len(output_ids) < min_length:
raise ValueError("Update the code to generate the sequences so that they are larger")
if len(output_ids) > max_length:
raise ValueError("Update the code to generate the sequences so that they are smaller")
if return_table_and_query:
return output_txt, output_ids, table, query
return output_txt, output_ids
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text
def test_full_tokenizer_roberta(self):
tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "lower newer"
bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def roberta_dict_integration_testing(self):
tokenizer = self.get_tokenizer()
self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
self.assertListEqual(
tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
)
def test_add_tokens_tokenizer(self):
tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
vocab_size = tokenizer.vocab_size
all_size = len(tokenizer)
self.assertNotEqual(vocab_size, 0)
# We usually have added tokens from the start in tests because our vocab fixtures are
# smaller than the original vocabs - let's not assert this
# self.assertEqual(vocab_size, all_size)
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
added_toks = tokenizer.add_tokens(new_toks)
vocab_size_2 = tokenizer.vocab_size
all_size_2 = len(tokenizer)
self.assertNotEqual(vocab_size_2, 0)
self.assertEqual(vocab_size, vocab_size_2)
self.assertEqual(added_toks, len(new_toks))
self.assertEqual(all_size_2, all_size + len(new_toks))
tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
self.assertGreaterEqual(len(tokens), 4)
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
vocab_size_3 = tokenizer.vocab_size
all_size_3 = len(tokenizer)
self.assertNotEqual(vocab_size_3, 0)
self.assertEqual(vocab_size, vocab_size_3)
self.assertEqual(added_toks_2, len(new_toks_2))
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
tokens = tokenizer.encode(
table,
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
add_special_tokens=False,
)
self.assertGreaterEqual(len(tokens), 6)
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[0], tokens[1])
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokens[-3])
self.assertEqual(tokens[0], tokenizer.eos_token_id)
self.assertEqual(tokens[-2], tokenizer.pad_token_id)
def test_token_type_ids(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
empty_table = self.get_table(tokenizer, length=0)
seq_0 = "Test this method."
# We want to have sequence 0 and sequence 1 are tagged
# respectively with 0 and 1 token_ids
# (regardless of whether the model use token type ids)
# We use this assumption in the QA pipeline among other place
output = tokenizer(empty_table, seq_0, return_token_type_ids=True)
# Assert that the token type IDs have the same length as the input IDs
self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
self.assertIn(0, output["token_type_ids"])
def test_add_special_tokens(self):
tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
input_table = self.get_table(tokenizer, length=0)
special_token = "[SPECIAL_TOKEN]"
tokenizer.add_special_tokens({"cls_token": special_token})
encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False)
self.assertEqual(len(encoded_special_token), 1)
decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
self.assertTrue(special_token not in decoded)
def test_batch_encode_plus_overflowing_tokens(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
table = self.get_table(tokenizer, length=10)
string_sequences = ["Testing the prepare_for_model method.", "Test"]
if tokenizer.pad_token is None:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.batch_encode_plus(
table, string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3
)
@is_pt_tf_cross_test
def test_batch_encode_plus_tensors(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
sequences = [
"Testing batch encode plus",
"Testing batch encode plus with different sequence lengths",
"Testing batch encode plus with different sequence lengths correctly pads",
]
table = self.get_table(tokenizer, length=0)
# A Tensor cannot be build by sequences which are not the same size
self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt")
self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf")
if tokenizer.pad_token_id is None:
self.assertRaises(
ValueError,
tokenizer.batch_encode_plus,
table,
sequences,
padding=True,
return_tensors="pt",
)
self.assertRaises(
ValueError,
tokenizer.batch_encode_plus,
table,
sequences,
padding="longest",
return_tensors="tf",
)
else:
pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt")
tensorflow_tensor = tokenizer.batch_encode_plus(
table, sequences, padding="longest", return_tensors="tf"
)
encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True)
for key in encoded_sequences.keys():
pytorch_value = pytorch_tensor[key].tolist()
tensorflow_value = tensorflow_tensor[key].numpy().tolist()
encoded_value = encoded_sequences[key]
self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
sequences = [
"Testing batch encode plus",
"Testing batch encode plus with different sequence lengths",
"Testing batch encode plus with different sequence lengths correctly pads",
]
# Test not batched
table = self.get_table(tokenizer, length=0)
encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0])
encoded_sequences_2 = tokenizer(table, sequences[0])
self.assertEqual(encoded_sequences_1, encoded_sequences_2)
# Test not batched pairs
table = self.get_table(tokenizer, length=10)
encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1])
encoded_sequences_2 = tokenizer(table, sequences[1])
self.assertEqual(encoded_sequences_1, encoded_sequences_2)
# Test batched
table = self.get_table(tokenizer, length=0)
encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences)
encoded_sequences_2 = tokenizer(table, sequences)
self.assertEqual(encoded_sequences_1, encoded_sequences_2)
def test_internal_consistency(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
input_text, output_text = self.get_input_output_texts(tokenizer)
tokens = tokenizer.tokenize(input_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False)
self.assertListEqual(ids, ids_2)
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
self.assertNotEqual(len(tokens_2), 0)
text_2 = tokenizer.decode(ids)
self.assertIsInstance(text_2, str)
self.assertEqual(text_2, output_text)
def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
self.assertNotEqual(tokenizer.model_max_length, 42)
# Now let's start the test
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
# Isolate this from the other tests because we save additional tokens/etc
table = self.get_table(tokenizer, length=0)
tmpdirname = tempfile.mkdtemp()
sample_text = " He is very happy, UNwant\u00E9d,running"
before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False)
before_vocab = tokenizer.get_vocab()
tokenizer.save_pretrained(tmpdirname)
after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False)
after_vocab = after_tokenizer.get_vocab()
self.assertListEqual(before_tokens, after_tokens)
self.assertDictEqual(before_vocab, after_vocab)
shutil.rmtree(tmpdirname)
def test_number_of_added_tokens(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table, query = self.get_table_and_query(tokenizer)
sequences = tokenizer.encode(table, query, add_special_tokens=False)
attached_sequences = tokenizer.encode(table, query, add_special_tokens=True)
self.assertEqual(2, len(attached_sequences) - len(sequences))
@unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`")
def test_prepare_for_model(self):
pass
@unittest.skip("TAPEX tokenizer does not support pairs.")
def test_maximum_encoding_length_pair_input(self):
pass
@unittest.skip("TAPEX tokenizer does not support pairs.")
def test_maximum_encoding_length_single_input(self):
pass
@unittest.skip("Not implemented")
def test_right_and_left_truncation(self):
pass
def test_encode_decode_with_spaces(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
tokenizer.add_tokens(new_toks)
input = "[ABC][DEF][ABC][DEF]"
if self.space_between_special_tokens:
output = "[ABC] [DEF] [ABC] [DEF]"
else:
output = input
encoded = tokenizer.encode(table, input, add_special_tokens=False)
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertIn(decoded, [output, output.lower()])
def test_tokenize_special_tokens(self):
"""Test `tokenize` with special tokens."""
tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"
# TODO:
# Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it)
# with one variable(property) for a better maintainability?
# `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
# `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
# which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
self.assertEqual(len(token_1), 1)
self.assertEqual(len(token_2), 1)
self.assertEqual(token_1[0], SPECIAL_TOKEN_1)
self.assertEqual(token_2[0], SPECIAL_TOKEN_2)
def test_special_tokens_mask(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
sequence_0 = "Encode this."
# Testing single inputs
encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(
table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
self.assertEqual(encoded_sequence, filtered_sequence)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer)
sequence = "Sequence"
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequence)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(
table,
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
)
padded_sequence_length = len(padded_sequence)
self.assertEqual(sequence_length + padding_size, padded_sequence_length)
self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
self.assertEqual(sequence_length, padded_sequence_right_length)
self.assertListEqual(encoded_sequence, padded_sequence_right)
def test_padding_to_multiple_of(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
else:
empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8)
normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8)
for key, value in empty_tokens.items():
self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
for key, value in normal_tokens.items():
self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8)
for key, value in normal_tokens.items():
self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
# Should also work with truncation
normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8)
for key, value in normal_tokens.items():
self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
def test_right_and_left_padding(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
sequence = "Sequence"
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequence)
padding_idx = tokenizer.pad_token_id
# RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(
table, sequence, max_length=sequence_length + padding_size, padding="max_length"
)
padded_sequence_length = len(padded_sequence)
self.assertEqual(sequence_length + padding_size, padded_sequence_length)
self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
# LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "left"
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(
table, sequence, max_length=sequence_length + padding_size, padding="max_length"
)
padded_sequence_length = len(padded_sequence)
self.assertEqual(sequence_length + padding_size, padded_sequence_length)
self.assertListEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)
# RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(table, sequence, padding=True)
padded_sequence_right_length = len(padded_sequence_right)
self.assertEqual(sequence_length, padded_sequence_right_length)
self.assertListEqual(encoded_sequence, padded_sequence_right)
tokenizer.padding_side = "left"
padded_sequence_left = tokenizer.encode(table, sequence, padding="longest")
padded_sequence_left_length = len(padded_sequence_left)
self.assertEqual(sequence_length, padded_sequence_left_length)
self.assertListEqual(encoded_sequence, padded_sequence_left)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(table, sequence)
padded_sequence_right_length = len(padded_sequence_right)
self.assertEqual(sequence_length, padded_sequence_right_length)
self.assertListEqual(encoded_sequence, padded_sequence_right)
tokenizer.padding_side = "left"
padded_sequence_left = tokenizer.encode(table, sequence, padding=False)
padded_sequence_left_length = len(padded_sequence_left)
self.assertEqual(sequence_length, padded_sequence_left_length)
self.assertListEqual(encoded_sequence, padded_sequence_left)
def test_encode_plus_with_padding(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
sequence = "Sequence"
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequence)
padding_size = 10
padding_idx = tokenizer.pad_token_id
token_type_padding_idx = tokenizer.pad_token_type_id
encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True)
input_ids = encoded_sequence["input_ids"]
special_tokens_mask = encoded_sequence["special_tokens_mask"]
sequence_length = len(input_ids)
# Test 'longest' and 'no_padding' don't do anything
tokenizer.padding_side = "right"
not_padded_sequence = tokenizer.encode_plus(
table,
sequence,
padding=False,
return_special_tokens_mask=True,
)
not_padded_input_ids = not_padded_sequence["input_ids"]
not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
not_padded_sequence_length = len(not_padded_input_ids)
self.assertEqual(sequence_length, not_padded_sequence_length)
self.assertListEqual(input_ids, not_padded_input_ids)
self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
not_padded_sequence = tokenizer.encode_plus(
table,
sequence,
padding=False,
return_special_tokens_mask=True,
)
not_padded_input_ids = not_padded_sequence["input_ids"]
not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
not_padded_sequence_length = len(not_padded_input_ids)
self.assertEqual(sequence_length, not_padded_sequence_length)
self.assertListEqual(input_ids, not_padded_input_ids)
self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
right_padded_sequence = tokenizer.encode_plus(
table,
sequence,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
right_padded_sequence_length = len(right_padded_input_ids)
self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
table,
sequence,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)
if "token_type_ids" in tokenizer.model_input_names:
token_type_ids = encoded_sequence["token_type_ids"]
left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
self.assertListEqual(
(token_type_ids + [[token_type_padding_idx] * 7] * padding_size, right_padded_token_type_ids)
)
self.assertListEqual(
[[token_type_padding_idx] * 7] * padding_size + token_type_ids, left_padded_token_type_ids
)
if "attention_mask" in tokenizer.model_input_names:
attention_mask = encoded_sequence["attention_mask"]
right_padded_attention_mask = right_padded_sequence["attention_mask"]
left_padded_attention_mask = left_padded_sequence["attention_mask"]
self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
def test_batch_encode_plus_padding(self):
# Test that padded sequences are equivalent between batch_encode_plus and encode_plus
# Right padding tests
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
sequences = [
"Testing batch encode plus",
"Testing batch encode plus with different sequence lengths",
"Testing batch encode plus with different sequence lengths correctly pads",
]
max_length = 100
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequences)
encoded_sequences = [
tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
for sequence in sequences
]
encoded_sequences_batch = tokenizer.batch_encode_plus(
table, sequences, max_length=max_length, padding="max_length"
)
self.assertListEqual(
encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
)
# Left padding tests
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
tokenizer.padding_side = "left"
sequences = [
"Testing batch encode plus",
"Testing batch encode plus with different sequence lengths",
"Testing batch encode plus with different sequence lengths correctly pads",
]
max_length = 100
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequences)
encoded_sequences = [
tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
for sequence in sequences
]
encoded_sequences_batch = tokenizer.batch_encode_plus(
table, sequences, max_length=max_length, padding="max_length"
)
self.assertListEqual(
encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
)
def test_batch_encode_plus_batch_sequence_length(self):
# Tests that all encoded values have the correct size
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer, length=0)
sequences = [
"Testing batch encode plus",
"Testing batch encode plus with different sequence lengths",
"Testing batch encode plus with different sequence lengths correctly pads",
]
encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences]
encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False)
self.assertListEqual(
encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
)
maximum_length = len(
max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
)
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequences)
encoded_sequences_padded = [
tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length")
for sequence in sequences
]
encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True)
self.assertListEqual(
encoded_sequences_padded,
self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
)
# check 'longest' is unsensitive to a max length
encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True)
encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
table, sequences, max_length=maximum_length + 10, padding="longest"
)
for key in encoded_sequences_batch_padded_1.keys():
self.assertListEqual(
encoded_sequences_batch_padded_1[key],
encoded_sequences_batch_padded_2[key],
)
# check 'no_padding' is unsensitive to a max length
encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False)
encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
table, sequences, max_length=maximum_length + 10, padding=False
)
for key in encoded_sequences_batch_padded_1.keys():
self.assertListEqual(
encoded_sequences_batch_padded_1[key],
encoded_sequences_batch_padded_2[key],
)
def test_special_tokens_mask_input_pairs(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
sequence_0 = "Encode this."
empty_table = self.get_table(tokenizer, length=0)
table = self.get_table(tokenizer, length=10)
encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False)
number_of_tokens = len(encoded_sequence)
encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(
table,
sequence_0,
add_special_tokens=True,
return_special_tokens_mask=True,
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [
(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
]
# NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it
# in order to have equivalent results with encoding an empty table or empty sequence
del filtered_sequence[number_of_tokens + 1]
filtered_sequence = [x for x in filtered_sequence if x is not None]
print("Encoded sequence:", encoded_sequence)
print("Filtered sequence:", filtered_sequence)
self.assertEqual(encoded_sequence, filtered_sequence)
@slow
def test_full_tokenizer(self):
question = "Greece held its last Summer Olympics in 2004"
table_dict = {
"header": ["Year", "City", "Country", "Nations"],
"rows": [
[1896, "Athens", "Greece", 14],
[1900, "Paris", "France", 24],
[1904, "St. Louis", "USA", 12],
[2004, "Athens", "Greece", 201],
[2008, "Beijing", "China", 204],
[2012, "London", "UK", 204],
],
}
table = pd.DataFrame.from_dict(table_dict["rows"])
table.columns = table_dict["header"]
tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
encoding = tokenizer(table, question)
# fmt: off
expected_results = {'input_ids': [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]}
# fmt: on
self.assertListEqual(encoding.input_ids, expected_results["input_ids"])
def test_tokenizer_as_target(self):
# by default the tokenizer do_lower_case
tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
answer_text = "tapex is a good model!"
expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
with tokenizer.as_target_tokenizer():
answer_encoding = tokenizer(answer=answer_text)
self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
@slow
def test_tokenizer_lower_case(self):
cased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=False)
uncased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=True)
answer_text = "Beijing, London, Paris"
answer_text_lower = "beijing, london, paris"
with cased_tokenizer.as_target_tokenizer():
with uncased_tokenizer.as_target_tokenizer():
self.assertNotEqual(
cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
)
self.assertEqual(
cased_tokenizer(answer=answer_text_lower).input_ids,
uncased_tokenizer(answer=answer_text).input_ids,
)
# batched encoding assert
self.assertNotEqual(
cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
)
self.assertEqual(
cased_tokenizer(answer=[answer_text_lower]).input_ids,
uncased_tokenizer(answer=[answer_text]).input_ids,
)
# test input encoding lowercase
question = "Greece held its last Summer Olympics in 2004"
table_dict = {
"header": ["Year", "City", "Country", "Nations"],
"rows": [
[1896, "Athens", "Greece", 14],
[1900, "Paris", "France", 24],
[1904, "St. Louis", "USA", 12],
[2004, "Athens", "Greece", 201],
[2008, "Beijing", "China", 204],
[2012, "London", "UK", 204],
],
}
table = pd.DataFrame.from_dict(table_dict["rows"])
table.columns = table_dict["header"]
self.assertNotEqual(
cased_tokenizer(table=table, query=question).input_ids,
uncased_tokenizer(table=table, query=question).input_ids,
)
docs/source/en/quicktour.mdx
docs/source/en/task_summary.mdx
docs/source/en/model_doc/speech_to_text.mdx
docs/source/en/model_doc/tapex.mdx
src/transformers/generation_utils.py
src/transformers/models/bart/modeling_bart.py
src/transformers/models/beit/modeling_beit.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment