Unverified Commit f4e04cd2 authored by Thomas Wolf, committed by GitHub

[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)

* Fixing roberta for slow-fast tests

* WIP getting equivalence on pipelines

* slow-to-fast equivalence - working on question-answering pipeline

* optional FAISS tests

* Pipeline Q&A

* Move pipeline tests to their own test job again

* update tokenizer to add sequence id methods

* update to tokenizers 0.9.4

* set sentencepiece as optional

* clean up squad

* clean up pipelines to use sequence_ids

* style/quality

* wording

* Switch to use_fast = True by default

* update tests for use_fast at True by default

* fix rag tokenizer test

* removing protobuf from required dependencies

* fix NER test for use_fast = True by default

* fixing example tests (Q&A examples use slow tokenizers for now)

* protobuf moved from main deps to extras["sentencepiece"] and example deps

* fix protobuf install test

* try to fix seq2seq by switching to slow tokenizers for now

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent 24184e73
...@@ -736,6 +736,7 @@ def main(): ...@@ -736,6 +736,7 @@ def main():
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case, do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None, cache_dir=args.cache_dir if args.cache_dir else None,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
) )
model = AutoModelForQuestionAnswering.from_pretrained( model = AutoModelForQuestionAnswering.from_pretrained(
args.model_name_or_path, args.model_name_or_path,
...@@ -784,7 +785,10 @@ def main(): ...@@ -784,7 +785,10 @@ def main():
# Load a trained model and vocabulary that you have fine-tuned # Load a trained model and vocabulary that you have fine-tuned
model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True) model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
# SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
# So we use use_fast=False here for now until Fast-tokenizer-compatible examples are out
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
model.to(args.device) model.to(args.device)
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
......
...@@ -114,6 +114,7 @@ def main(): ...@@ -114,6 +114,7 @@ def main():
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
) )
model = AutoModelForQuestionAnswering.from_pretrained( model = AutoModelForQuestionAnswering.from_pretrained(
model_args.model_name_or_path, model_args.model_name_or_path,
......
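The use_fast=False pin above exists because the two tokenizer families report overflow differently; a minimal sketch of that difference (the checkpoint name is illustrative, not taken from this commit):

from transformers import AutoTokenizer

text_pair = ("a short question", "a very long context " * 50)

# Slow tokenizer: a single encoding per example, with the cut-off ids under "overflowing_tokens".
slow = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
out = slow(*text_pair, truncation="only_second", max_length=48, stride=16, return_overflowing_tokens=True)
print(len(out["overflowing_tokens"]))  # ids removed from the context

# Fast tokenizer: every overflowing window becomes an extra row, so one example yields several features.
fast = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
out = fast(*text_pair, truncation="only_second", max_length=48, stride=16, return_overflowing_tokens=True)
print(len(out["input_ids"]))  # number of windows produced for this single example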
...@@ -18,3 +18,4 @@ fire ...@@ -18,3 +18,4 @@ fire
pytest pytest
conllu conllu
sentencepiece != 0.1.92 sentencepiece != 0.1.92
protobuf
...@@ -197,7 +197,7 @@ class TestAll(TestCasePlus): ...@@ -197,7 +197,7 @@ class TestAll(TestCasePlus):
) )
@require_torch_non_multi_gpu_but_fix_me @require_torch_non_multi_gpu_but_fix_me
def test_dataset_kwargs(self, tok_name): def test_dataset_kwargs(self, tok_name):
tokenizer = AutoTokenizer.from_pretrained(tok_name) tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
if tok_name == MBART_TINY: if tok_name == MBART_TINY:
train_dataset = Seq2SeqDataset( train_dataset = Seq2SeqDataset(
tokenizer, tokenizer,
......
...@@ -96,13 +96,13 @@ else: ...@@ -96,13 +96,13 @@ else:
extras["retrieval"] = ["faiss-cpu", "datasets"] extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"] extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"]
extras["tokenizers"] = ["tokenizers==0.9.2"] extras["tokenizers"] = ["tokenizers==0.9.4"]
extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"] extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"]
extras["modelcreation"] = ["cookiecutter==1.7.2"] extras["modelcreation"] = ["cookiecutter==1.7.2"]
extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
extras["sentencepiece"] = ["sentencepiece==0.1.91"] extras["sentencepiece"] = ["sentencepiece==0.1.91", "protobuf"]
extras["retrieval"] = ["faiss-cpu", "datasets"] extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] + extras["modelcreation"] extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] + extras["modelcreation"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style. # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
...@@ -130,7 +130,7 @@ setup( ...@@ -130,7 +130,7 @@ setup(
packages=find_packages("src"), packages=find_packages("src"),
install_requires=[ install_requires=[
"numpy", "numpy",
"tokenizers == 0.9.3", "tokenizers == 0.9.4",
# dataclasses for Python versions that don't have it # dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'", "dataclasses;python_version<'3.7'",
# utilities from PyPA to e.g. compare versions # utilities from PyPA to e.g. compare versions
...@@ -143,9 +143,6 @@ setup( ...@@ -143,9 +143,6 @@ setup(
"tqdm >= 4.27", "tqdm >= 4.27",
# for OpenAI GPT # for OpenAI GPT
"regex != 2019.12.17", "regex != 2019.12.17",
# for SentencePiece models
"sentencepiece == 0.1.91",
"protobuf",
# for XLM # for XLM
"sacremoses", "sacremoses",
], ],
......
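Since sentencepiece and protobuf are no longer hard requirements, slow sentencepiece-based tokenizers now need the optional extra (pip install "transformers[sentencepiece]"); a small runtime check using the helper that already exists in file_utils:

from transformers.file_utils import is_sentencepiece_available

if not is_sentencepiece_available():
    # Installing the extra pulls in sentencepiece==0.1.91 and protobuf, as declared in setup.py above.
    print("pip install 'transformers[sentencepiece]' to use slow sentencepiece-based tokenizers (T5, ALBERT, XLNet, ...)")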
...@@ -24,10 +24,7 @@ from typing import Dict, List, Tuple ...@@ -24,10 +24,7 @@ from typing import Dict, List, Tuple
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece from tokenizers.models import BPE, Unigram, WordPiece
# from transformers.tokenization_openai import OpenAIGPTTokenizer from .file_utils import requires_protobuf, requires_sentencepiece
from transformers.utils import sentencepiece_model_pb2 as model
from .file_utils import requires_sentencepiece
class SentencePieceExtractor: class SentencePieceExtractor:
...@@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool: ...@@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool:
return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit() return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit()
def get_proto(filename: str):
m = model.ModelProto()
m.ParseFromString(open(filename, "rb").read())
return m
class Converter: class Converter:
def __init__(self, original_tokenizer): def __init__(self, original_tokenizer):
self.original_tokenizer = original_tokenizer self.original_tokenizer = original_tokenizer
...@@ -292,8 +283,15 @@ class RobertaConverter(Converter): ...@@ -292,8 +283,15 @@ class RobertaConverter(Converter):
class SpmConverter(Converter): class SpmConverter(Converter):
def __init__(self, *args): def __init__(self, *args):
requires_protobuf(self)
super().__init__(*args) super().__init__(*args)
self.proto = get_proto(self.original_tokenizer.vocab_file)
from .utils import sentencepiece_model_pb2 as model_pb2
m = model_pb2.ModelProto()
m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read())
self.proto = m
def vocab(self, proto): def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces] return [(piece.piece, piece.score) for piece in proto.pieces]
......
...@@ -8,7 +8,7 @@ from tqdm import tqdm ...@@ -8,7 +8,7 @@ from tqdm import tqdm
from ...file_utils import is_tf_available, is_torch_available from ...file_utils import is_tf_available, is_torch_available
from ...tokenization_bert import whitespace_tokenize from ...tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import logging from ...utils import logging
from .utils import DataProcessor from .utils import DataProcessor
...@@ -765,6 +765,7 @@ class SquadFeatures: ...@@ -765,6 +765,7 @@ class SquadFeatures:
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
start_position: start of the answer token index start_position: start of the answer token index
end_position: end of the answer token index end_position: end of the answer token index
encoding: optionally stores the BatchEncoding with the fast-tokenizer alignment methods.
""" """
def __init__( def __init__(
...@@ -784,6 +785,7 @@ class SquadFeatures: ...@@ -784,6 +785,7 @@ class SquadFeatures:
end_position, end_position,
is_impossible, is_impossible,
qas_id: str = None, qas_id: str = None,
encoding: BatchEncoding = None,
): ):
self.input_ids = input_ids self.input_ids = input_ids
self.attention_mask = attention_mask self.attention_mask = attention_mask
...@@ -803,6 +805,8 @@ class SquadFeatures: ...@@ -803,6 +805,8 @@ class SquadFeatures:
self.is_impossible = is_impossible self.is_impossible = is_impossible
self.qas_id = qas_id self.qas_id = qas_id
self.encoding = encoding
class SquadResult: class SquadResult:
""" """
......
...@@ -185,6 +185,15 @@ except ImportError: ...@@ -185,6 +185,15 @@ except ImportError:
_sentencepiece_available = False _sentencepiece_available = False
try:
import google.protobuf # noqa: F401
_protobuf_available = True
except ImportError:
_protobuf_available = False
try: try:
import tokenizers # noqa: F401 import tokenizers # noqa: F401
...@@ -270,6 +279,10 @@ def is_sentencepiece_available(): ...@@ -270,6 +279,10 @@ def is_sentencepiece_available():
return _sentencepiece_available return _sentencepiece_available
def is_protobuf_available():
return _protobuf_available
def is_tokenizers_available(): def is_tokenizers_available():
return _tokenizers_available return _tokenizers_available
...@@ -330,6 +343,14 @@ that match your environment. ...@@ -330,6 +343,14 @@ that match your environment.
""" """
# docstyle-ignore
PROTOBUF_IMPORT_ERROR = """
{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment.
"""
# docstyle-ignore # docstyle-ignore
FAISS_IMPORT_ERROR = """ FAISS_IMPORT_ERROR = """
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the {0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
...@@ -420,6 +441,12 @@ def requires_sentencepiece(obj): ...@@ -420,6 +441,12 @@ def requires_sentencepiece(obj):
raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name)) raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))
def requires_protobuf(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_protobuf_available():
raise ImportError(PROTOBUF_IMPORT_ERROR.format(name))
def add_start_docstrings(*docstr): def add_start_docstrings(*docstr):
def docstring_decorator(fn): def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
......
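The new is_protobuf_available / requires_protobuf helpers mirror the existing sentencepiece and faiss guards; a hedged sketch of how a class can use them (the class and file names are illustrative):

from transformers.file_utils import is_protobuf_available, requires_protobuf


class SpmModelReader:
    """Illustrative: fail fast with PROTOBUF_IMPORT_ERROR when protobuf is missing."""

    def __init__(self, vocab_file: str):
        requires_protobuf(self)  # raises ImportError naming this class if protobuf is absent
        self.vocab_file = vocab_file


if is_protobuf_available():
    reader = SpmModelReader("spiece.model")  # hypothetical sentencepiece model file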
...@@ -32,7 +32,7 @@ import numpy as np ...@@ -32,7 +32,7 @@ import numpy as np
from .configuration_auto import AutoConfig from .configuration_auto import AutoConfig
from .configuration_utils import PretrainedConfig from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features from .data import SquadExample, SquadFeatures, squad_convert_examples_to_features
from .file_utils import add_end_docstrings, is_tf_available, is_torch_available from .file_utils import add_end_docstrings, is_tf_available, is_torch_available
from .modelcard import ModelCard from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer from .tokenization_auto import AutoTokenizer
...@@ -1758,6 +1758,7 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1758,6 +1758,7 @@ class QuestionAnsweringPipeline(Pipeline):
- **answer** (:obj:`str`) -- The answer to the question. - **answer** (:obj:`str`) -- The answer to the question.
""" """
# Set defaults values # Set defaults values
kwargs.setdefault("padding", "longest")
kwargs.setdefault("topk", 1) kwargs.setdefault("topk", 1)
kwargs.setdefault("doc_stride", 128) kwargs.setdefault("doc_stride", 128)
kwargs.setdefault("max_answer_len", 15) kwargs.setdefault("max_answer_len", 15)
...@@ -1773,19 +1774,87 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1773,19 +1774,87 @@ class QuestionAnsweringPipeline(Pipeline):
# Convert inputs to features # Convert inputs to features
examples = self._args_parser(*args, **kwargs) examples = self._args_parser(*args, **kwargs)
features_list = [ if not self.tokenizer.is_fast:
squad_convert_examples_to_features( features_list = [
examples=[example], squad_convert_examples_to_features(
tokenizer=self.tokenizer, examples=[example],
max_seq_length=kwargs["max_seq_len"], tokenizer=self.tokenizer,
doc_stride=kwargs["doc_stride"], max_seq_length=kwargs["max_seq_len"],
max_query_length=kwargs["max_question_len"], doc_stride=kwargs["doc_stride"],
padding_strategy=PaddingStrategy.MAX_LENGTH.value, max_query_length=kwargs["max_question_len"],
is_training=False, padding_strategy=PaddingStrategy.MAX_LENGTH.value,
tqdm_enabled=False, is_training=False,
) tqdm_enabled=False,
for example in examples )
] for example in examples
]
else:
features_list = []
for example in examples:
# Define the side we want to truncate / pad and the text/pair sorting
question_first = bool(self.tokenizer.padding_side == "right")
encoded_inputs = self.tokenizer(
text=example.question_text if question_first else example.context_text,
text_pair=example.context_text if question_first else example.question_text,
padding=kwargs["padding"],
truncation="only_second" if question_first else "only_first",
max_length=kwargs["max_seq_len"],
stride=kwargs["doc_stride"],
return_tensors="np",
return_token_type_ids=True,
return_overflowing_tokens=True,
return_offsets_mapping=True,
return_special_tokens_mask=True,
)
# When the input is too long, it's converted into a batch of inputs with overflowing tokens
# and a stride of overlap between the inputs. If a batch of inputs is given, a special output
# "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample.
# Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
# "num_span" is the number of output samples generated from the overflowing tokens.
num_spans = len(encoded_inputs["input_ids"])
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
# We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
p_mask = np.asarray(
[
[tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
for span_id in range(num_spans)
]
)
# keep the cls_token unmasked (some models use it to indicate unanswerable questions)
if self.tokenizer.cls_token_id:
cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id)
p_mask[cls_index] = 0
features = []
for span_idx in range(num_spans):
features.append(
SquadFeatures(
input_ids=encoded_inputs["input_ids"][span_idx],
attention_mask=encoded_inputs["attention_mask"][span_idx],
token_type_ids=encoded_inputs["token_type_ids"][span_idx],
p_mask=p_mask[span_idx].tolist(),
encoding=encoded_inputs[span_idx],
# We don't use the rest of the values - and actually
# for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
cls_index=None,
token_to_orig_map={},
example_index=0,
unique_id=0,
paragraph_len=0,
token_is_max_context=0,
tokens=[],
start_position=0,
end_position=0,
is_impossible=False,
qas_id=None,
)
)
features_list.append(features)
all_answers = [] all_answers = []
for features, example in zip(features_list, examples): for features, example in zip(features_list, examples):
model_input_names = self.tokenizer.model_input_names + ["input_ids"] model_input_names = self.tokenizer.model_input_names + ["input_ids"]
...@@ -1828,20 +1897,56 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1828,20 +1897,56 @@ class QuestionAnsweringPipeline(Pipeline):
start_[0] = end_[0] = 0.0 start_[0] = end_[0] = 0.0
starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
char_to_word = np.array(example.char_to_word_offset) if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset)
# Convert the answer (tokens) back to the original text
answers += [ # Convert the answer (tokens) back to the original text
{ # Score: score from the model
"score": score.item(), # Start: Index of the first character of the answer in the context string
"start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), # End: Index of the character following the last character of the answer in the context string
"end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), # Answer: Plain text of the answer
"answer": " ".join( answers += [
example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] {
), "score": score.item(),
} "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
for s, e, score in zip(starts, ends, scores) "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
] "answer": " ".join(
example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
),
}
for s, e, score in zip(starts, ends, scores)
]
else:
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
question_first = bool(self.tokenizer.padding_side == "right")
enc = feature.encoding
# Sometimes the max probability token is in the middle of a word so:
# - we start by finding the right word containing the token with `token_to_word`
# - then we convert this word in a character span with `word_to_chars`
answers += [
{
"score": score.item(),
"start": enc.word_to_chars(
enc.token_to_word(s), sequence_index=1 if question_first else 0
)[0],
"end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[
1
],
"answer": example.context_text[
enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if question_first else 0)[
0
] : enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[
1
]
],
}
for s, e, score in zip(starts, ends, scores)
]
if kwargs["handle_impossible_answer"]: if kwargs["handle_impossible_answer"]:
answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})
...@@ -2735,7 +2840,7 @@ def pipeline( ...@@ -2735,7 +2840,7 @@ def pipeline(
tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
framework: Optional[str] = None, framework: Optional[str] = None,
revision: Optional[str] = None, revision: Optional[str] = None,
use_fast: bool = False, use_fast: bool = True,
**kwargs **kwargs
) -> Pipeline: ) -> Pipeline:
""" """
...@@ -2795,7 +2900,7 @@ def pipeline( ...@@ -2795,7 +2900,7 @@ def pipeline(
When passing a task name or a string model identifier: The specific model version to use. It can be a When passing a task name or a string model identifier: The specific model version to use. It can be a
branch name, a tag name, or a commit id, since we use a git-based system for storing models and other branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`). Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
kwargs: kwargs:
Additional keyword arguments passed along to the specific pipeline init (see the documentation for the Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
......
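The fast branch above turns model start/end token indices into character spans via the encoding's alignment methods; a minimal sketch of that chain with made-up indices (the checkpoint is a real QA model, but the indices are not model outputs):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")  # fast by default now
question = "Who wrote the novel?"
context = "The novel was written by Jane Austen in 1813."
enc = tok(question, context)

# Pretend the model selected this token inside the context (hypothetical index).
s = e = enc.sequence_ids().index(1) + 5

# token -> word -> character span, always resolved against the context (sequence_index=1).
start_char = enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]
end_char = enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]
print(context[start_char:end_char])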
...@@ -280,7 +280,7 @@ class AutoTokenizer: ...@@ -280,7 +280,7 @@ class AutoTokenizer:
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
identifier allowed by git. identifier allowed by git.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to try to load the fast version of the tokenizer. Whether or not to try to load the fast version of the tokenizer.
kwargs (additional keyword arguments, `optional`): kwargs (additional keyword arguments, `optional`):
Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like
...@@ -308,7 +308,7 @@ class AutoTokenizer: ...@@ -308,7 +308,7 @@ class AutoTokenizer:
if "bert-base-japanese" in str(pretrained_model_name_or_path): if "bert-base-japanese" in str(pretrained_model_name_or_path):
return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
use_fast = kwargs.pop("use_fast", False) use_fast = kwargs.pop("use_fast", True)
if config.tokenizer_class is not None: if config.tokenizer_class is not None:
if use_fast and not config.tokenizer_class.endswith("Fast"): if use_fast and not config.tokenizer_class.endswith("Fast"):
......
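With the default flipped, AutoTokenizer.from_pretrained now returns the Rust-backed tokenizer whenever one exists; the previous behaviour is one flag away (model name illustrative):

from transformers import AutoTokenizer, PreTrainedTokenizerFast

tok = AutoTokenizer.from_pretrained("bert-base-cased")
assert isinstance(tok, PreTrainedTokenizerFast)  # fast is now the default

slow_tok = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
assert not slow_tok.is_fast  # opt back into the pure-Python tokenizer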
...@@ -18,6 +18,7 @@ from typing import List, Optional ...@@ -18,6 +18,7 @@ from typing import List, Optional
from .tokenization_gpt2_fast import GPT2TokenizerFast from .tokenization_gpt2_fast import GPT2TokenizerFast
from .tokenization_roberta import RobertaTokenizer from .tokenization_roberta import RobertaTokenizer
from .tokenization_utils_base import AddedToken
from .utils import logging from .utils import logging
...@@ -172,6 +173,32 @@ class RobertaTokenizerFast(GPT2TokenizerFast): ...@@ -172,6 +173,32 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
**kwargs, **kwargs,
) )
@property
def mask_token(self) -> str:
"""
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@mask_token.setter
def mask_token(self, value):
"""
Overriding the default behavior of the mask token to have it eat the space before it.
This is needed to preserve backward compatibility with all the previously used models based on Roberta.
"""
# Mask token behave like a normal word, i.e. include the space before it
# So we set lstrip to True
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
self._mask_token = value
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None: if token_ids_1 is None:
......
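The overridden setter registers the mask token with lstrip=True so the fast tokenizer swallows the space in front of it, matching the slow RoBERTa tokenizer used by the fill-mask pipeline; a quick sketch of the effect:

from transformers import RobertaTokenizerFast

tok = RobertaTokenizerFast.from_pretrained("roberta-base")
ids = tok("The goal of life is <mask>.")["input_ids"]
# The space before "<mask>" is absorbed by the mask token instead of becoming a stray "Ġ" token.
print(tok.convert_ids_to_tokens(ids))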
...@@ -182,7 +182,9 @@ def to_py_obj(obj): ...@@ -182,7 +182,9 @@ def to_py_obj(obj):
""" """
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
""" """
if isinstance(obj, (list, tuple)): if isinstance(obj, (dict, BatchEncoding)):
return {k: to_py_obj(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple)):
return [to_py_obj(o) for o in obj] return [to_py_obj(o) for o in obj]
elif is_tf_available() and isinstance(obj, tf.Tensor): elif is_tf_available() and isinstance(obj, tf.Tensor):
return obj.numpy().tolist() return obj.numpy().tolist()
...@@ -216,6 +218,9 @@ class BatchEncoding(UserDict): ...@@ -216,6 +218,9 @@ class BatchEncoding(UserDict):
initialization. initialization.
prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above). Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
n_sequences (:obj:`Optional[int]`, `optional`):
The number of sequences used to generate each sample from the batch encoded in this
:class:`~transformers.BatchEncoding`.
""" """
def __init__( def __init__(
...@@ -224,6 +229,7 @@ class BatchEncoding(UserDict): ...@@ -224,6 +229,7 @@ class BatchEncoding(UserDict):
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
tensor_type: Union[None, str, TensorType] = None, tensor_type: Union[None, str, TensorType] = None,
prepend_batch_axis: bool = False, prepend_batch_axis: bool = False,
n_sequences: Optional[int] = None,
): ):
super().__init__(data) super().__init__(data)
...@@ -232,8 +238,22 @@ class BatchEncoding(UserDict): ...@@ -232,8 +238,22 @@ class BatchEncoding(UserDict):
self._encodings = encoding self._encodings = encoding
if n_sequences is None and encoding is not None and len(encoding):
n_sequences = encoding[0].n_sequences
self._n_sequences = n_sequences
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
@property
def n_sequences(self) -> Optional[int]:
"""
:obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
:class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
sentence) or :obj:`2` (a pair of sentences)
"""
return self._n_sequences
@property @property
def is_fast(self) -> bool: def is_fast(self) -> bool:
""" """
...@@ -311,6 +331,27 @@ class BatchEncoding(UserDict): ...@@ -311,6 +331,27 @@ class BatchEncoding(UserDict):
raise ValueError("tokens() is not available when using Python-based tokenizers") raise ValueError("tokens() is not available when using Python-based tokenizers")
return self._encodings[batch_index].tokens return self._encodings[batch_index].tokens
def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to the id of their original sentences:
- :obj:`None` for special tokens added around or between sequences,
- :obj:`0` for tokens corresponding to words in the first sequence,
- :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
encoded.
Args:
batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
Returns:
:obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens
added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their
corresponding sequence.
"""
if not self._encodings:
raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
return self._encodings[batch_index].sequence_ids
def words(self, batch_index: int = 0) -> List[Optional[int]]: def words(self, batch_index: int = 0) -> List[Optional[int]]:
""" """
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
...@@ -325,7 +366,67 @@ class BatchEncoding(UserDict): ...@@ -325,7 +366,67 @@ class BatchEncoding(UserDict):
""" """
if not self._encodings: if not self._encodings:
raise ValueError("words() is not available when using Python-based tokenizers") raise ValueError("words() is not available when using Python-based tokenizers")
return self._encodings[batch_index].words warnings.warn(
"`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
"but more self-explanatory `BatchEncoding.word_ids()` property.",
FutureWarning,
)
return self.word_ids(batch_index)
def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
Args:
batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
Returns:
:obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
word (several tokens will be mapped to the same word index if they are parts of that word).
"""
if not self._encodings:
raise ValueError("word_ids() is not available when using Python-based tokenizers")
return self._encodings[batch_index].word_ids
def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
"""
Get the index of the sequence represented by the given token. In the general use case, this method returns
:obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair
Can be called as:
- ``self.token_to_sequence(token_index)`` if batch size is 1
- ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
words are defined by the user). In this case it allows to easily associate encoded tokens with provided
tokenized words.
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
sequence.
Returns:
:obj:`int`: Index of the sequence in the inputs.
"""
if not self._encodings:
raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
if token_index is not None:
batch_index = batch_or_token_index
else:
batch_index = 0
token_index = batch_or_token_index
if batch_index < 0:
batch_index = self._batch_size + batch_index
if token_index < 0:
token_index = self._seq_len + token_index
return self._encodings[batch_index].token_to_sequence(token_index)
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
""" """
...@@ -365,9 +466,11 @@ class BatchEncoding(UserDict): ...@@ -365,9 +466,11 @@ class BatchEncoding(UserDict):
token_index = self._seq_len + token_index token_index = self._seq_len + token_index
return self._encodings[batch_index].token_to_word(token_index) return self._encodings[batch_index].token_to_word(token_index)
def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> Optional[TokenSpan]: def word_to_tokens(
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
) -> Optional[TokenSpan]:
""" """
Get the encoded token span corresponding to a word in the sequence of the batch. Get the encoded token span corresponding to a word in a sequence of the batch.
Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with: Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:
...@@ -376,8 +479,9 @@ class BatchEncoding(UserDict): ...@@ -376,8 +479,9 @@ class BatchEncoding(UserDict):
Can be called as: Can be called as:
- ``self.word_to_tokens(word_index)`` if batch size is 1 - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
to 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
...@@ -390,6 +494,9 @@ class BatchEncoding(UserDict): ...@@ -390,6 +494,9 @@ class BatchEncoding(UserDict):
word_index (:obj:`int`, `optional`): word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence. sequence.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
or 1) the provided word index belongs to.
Returns: Returns:
Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence. Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
...@@ -407,7 +514,7 @@ class BatchEncoding(UserDict): ...@@ -407,7 +514,7 @@ class BatchEncoding(UserDict):
batch_index = self._batch_size + batch_index batch_index = self._batch_size + batch_index
if word_index < 0: if word_index < 0:
word_index = self._seq_len + word_index word_index = self._seq_len + word_index
span = self._encodings[batch_index].word_to_tokens(word_index) span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
return TokenSpan(*span) if span is not None else None return TokenSpan(*span) if span is not None else None
def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
...@@ -446,7 +553,9 @@ class BatchEncoding(UserDict): ...@@ -446,7 +553,9 @@ class BatchEncoding(UserDict):
token_index = batch_or_token_index token_index = batch_or_token_index
return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: def char_to_token(
self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
) -> int:
""" """
Get the index of the token in the encoded output comprising a character in the original string for a sequence Get the index of the token in the encoded output comprising a character in the original string for a sequence
of the batch. of the batch.
...@@ -467,6 +576,9 @@ class BatchEncoding(UserDict): ...@@ -467,6 +576,9 @@ class BatchEncoding(UserDict):
char_index (:obj:`int`, `optional`): char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence. sequence.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
or 1) the provided character index belongs to.
Returns: Returns:
...@@ -480,9 +592,11 @@ class BatchEncoding(UserDict): ...@@ -480,9 +592,11 @@ class BatchEncoding(UserDict):
else: else:
batch_index = 0 batch_index = 0
char_index = batch_or_char_index char_index = batch_or_char_index
return self._encodings[batch_index].char_to_token(char_index) return self._encodings[batch_index].char_to_token(char_index, sequence_index)
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: def word_to_chars(
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
) -> CharSpan:
""" """
Get the character span in the original string corresponding to given word in a sequence of the batch. Get the character span in the original string corresponding to given word in a sequence of the batch.
...@@ -503,6 +617,9 @@ class BatchEncoding(UserDict): ...@@ -503,6 +617,9 @@ class BatchEncoding(UserDict):
word_index (:obj:`int`, `optional`): word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence. sequence.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
or 1) the provided word index belongs to.
Returns: Returns:
:obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string. :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
...@@ -520,9 +637,9 @@ class BatchEncoding(UserDict): ...@@ -520,9 +637,9 @@ class BatchEncoding(UserDict):
else: else:
batch_index = 0 batch_index = 0
word_index = batch_or_word_index word_index = batch_or_word_index
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
""" """
Get the word in the original string corresponding to a character in the original string of a sequence of the Get the word in the original string corresponding to a character in the original string of a sequence of the
batch. batch.
...@@ -543,6 +660,9 @@ class BatchEncoding(UserDict): ...@@ -543,6 +660,9 @@ class BatchEncoding(UserDict):
char_index (:obj:`int`, `optional`): char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
original string. original string.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair (0
or 1) the provided character index belongs to.
Returns: Returns:
...@@ -556,7 +676,7 @@ class BatchEncoding(UserDict): ...@@ -556,7 +676,7 @@ class BatchEncoding(UserDict):
else: else:
batch_index = 0 batch_index = 0
char_index = batch_or_char_index char_index = batch_or_char_index
return self._encodings[batch_index].char_to_word(char_index) return self._encodings[batch_index].char_to_word(char_index, sequence_index)
def convert_to_tensors( def convert_to_tensors(
self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
...@@ -1872,6 +1992,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1872,6 +1992,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format." "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
) )
save_directory = str(save_directory)
added_tokens_file = os.path.join( added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
) )
......
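A short sketch of the new alignment helpers on a sentence pair (model name illustrative): sequence_ids distinguishes the two sentences, word_ids replaces the deprecated words, and the new sequence_index argument disambiguates word/character indices of the second sentence:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-cased")
enc = tok("How many cats are there?", "There are two cats.")

print(enc.sequence_ids())  # None for special tokens, 0 for the question, 1 for the context
print(enc.word_ids())      # word index of each token (None for special tokens)

# Character 0 of the *second* sentence ("There are two cats.") maps to its first token.
print(enc.char_to_token(0, sequence_index=1))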
...@@ -169,9 +169,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -169,9 +169,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return_offsets_mapping: bool = False, return_offsets_mapping: bool = False,
return_length: bool = False, return_length: bool = False,
verbose: bool = True, verbose: bool = True,
) -> Dict[str, Any]: ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
""" """
Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict. Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
of encodings, taking care of building a batch from overflowing tokens.
Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
lists (overflows) of lists (tokens). lists (overflows) of lists (tokens).
...@@ -203,7 +204,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -203,7 +204,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
if return_length: if return_length:
encoding_dict["length"].append(len(e.ids)) encoding_dict["length"].append(len(e.ids))
return encoding_dict return encoding_dict, encodings
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
""" """
...@@ -390,9 +391,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -390,9 +391,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
) )
# Convert encoding to dict # Convert encoding to dict
# `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]] # `Tokens` has type: Tuple[
# List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
# List[EncodingFast]
# ]
# with nested dimensions corresponding to batch, overflows, sequence length # with nested dimensions corresponding to batch, overflows, sequence length
tokens = [ tokens_and_encodings = [
self._convert_encoding( self._convert_encoding(
encoding=encoding, encoding=encoding,
return_token_type_ids=return_token_type_ids, return_token_type_ids=return_token_type_ids,
...@@ -406,22 +410,27 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -406,22 +410,27 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
for encoding in encodings for encoding in encodings
] ]
# Convert the output to have dict[list] from list[dict] # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
sanitized = {} # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
for key in tokens[0].keys(): # (we say ~ because the number of overflow varies with the example in the batch)
# To List[List[List[int]]] of shape (batch, overflows, sequence length) #
stack = [e for item in tokens for e in item[key]] # To match each overflowing sample with the original sample in the batch
sanitized[key] = stack # we add an overflow_to_sample_mapping array (see below)
sanitized_tokens = {}
for key in tokens_and_encodings[0][0].keys():
stack = [e for item, _ in tokens_and_encodings for e in item[key]]
sanitized_tokens[key] = stack
sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
# If returning overflowing tokens, we need to return a mapping # If returning overflowing tokens, we need to return a mapping
# from the batch idx to the original sample # from the batch idx to the original sample
if return_overflowing_tokens: if return_overflowing_tokens:
overflow_to_sample_mapping = [] overflow_to_sample_mapping = []
for i, enc in enumerate(tokens): for i, (toks, _) in enumerate(tokens_and_encodings):
overflow_to_sample_mapping += [i] * len(enc["input_ids"]) overflow_to_sample_mapping += [i] * len(toks["input_ids"])
sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
return BatchEncoding(sanitized, encodings, tensor_type=return_tensors) return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
def _encode_plus( def _encode_plus(
self, self,
...@@ -518,6 +527,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -518,6 +527,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained`
""" """
save_directory = str(save_directory)
if legacy_format: if legacy_format:
added_tokens_file = os.path.join( added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
......
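Because _batch_encode_plus now flattens overflowing windows into extra rows and keeps their EncodingFast objects, a batched call reports which original sample each row came from; a minimal sketch (model name illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-cased")
batch = ["a short sentence", "a much longer sentence " * 40]
enc = tok(batch, truncation=True, max_length=16, stride=4, return_overflowing_tokens=True)

print(len(enc["input_ids"]))              # more rows than inputs: each overflow window is its own row
print(enc["overflow_to_sample_mapping"])  # e.g. [0, 1, 1, 1, ...]: row -> original sample index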
from typing import List, Optional from typing import List, Optional
from unittest import mock
from transformers import is_tf_available, is_torch_available, pipeline from transformers import is_tf_available, is_torch_available, pipeline
# from transformers.pipelines import DefaultArgumentHandler, Pipeline
from transformers.pipelines import Pipeline from transformers.pipelines import Pipeline
from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow
from transformers.tokenization_utils_base import to_py_obj
VALID_INPUTS = ["A simple string", ["list of strings"]] VALID_INPUTS = ["A simple string", ["list of strings"]]
...@@ -13,9 +13,11 @@ VALID_INPUTS = ["A simple string", ["list of strings"]] ...@@ -13,9 +13,11 @@ VALID_INPUTS = ["A simple string", ["list of strings"]]
@is_pipeline_test @is_pipeline_test
class CustomInputPipelineCommonMixin: class CustomInputPipelineCommonMixin:
pipeline_task = None pipeline_task = None
pipeline_loading_kwargs = {} pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with
small_models = None # Models tested without the @slow decorator pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with
large_models = None # Models tested with the @slow decorator small_models = [] # Models tested without the @slow decorator
large_models = [] # Models tested with the @slow decorator
valid_inputs = VALID_INPUTS # Some inputs which are valid to compare fast and slow tokenizers
def setUp(self) -> None: def setUp(self) -> None:
if not is_tf_available() and not is_torch_available(): if not is_tf_available() and not is_torch_available():
...@@ -47,73 +49,11 @@ class CustomInputPipelineCommonMixin: ...@@ -47,73 +49,11 @@ class CustomInputPipelineCommonMixin:
@require_torch @require_torch
@slow @slow
def test_pt_defaults(self): def test_pt_defaults(self):
pipeline(self.pipeline_task, framework="pt")
@require_tf
@slow
def test_tf_defaults(self):
pipeline(self.pipeline_task, framework="tf")
@require_torch
def test_torch_small(self):
for model_name in self.small_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt")
self._test_pipeline(nlp)
@require_tf
def test_tf_small(self):
for model_name in self.small_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf")
self._test_pipeline(nlp)
@require_torch
@slow
def test_torch_large(self):
for model_name in self.large_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt")
self._test_pipeline(nlp)
@require_tf
@slow
def test_tf_large(self):
for model_name in self.large_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf")
self._test_pipeline(nlp)
def _test_pipeline(self, nlp: Pipeline):
raise NotImplementedError
@is_pipeline_test
class MonoInputPipelineCommonMixin:
pipeline_task = None
pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with
pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with
small_models = [] # Models tested without the @slow decorator
large_models = [] # Models tested with the @slow decorator
mandatory_keys = {} # Keys which should be in the output
valid_inputs = VALID_INPUTS # inputs which are valid
invalid_inputs = [None] # inputs which are not allowed
expected_multi_result: Optional[List] = None
expected_check_keys: Optional[List[str]] = None
def setUp(self) -> None:
if not is_tf_available() and not is_torch_available():
return # Currently no JAX pipelines
for model_name in self.small_models:
pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs)
for model_name in self.large_models:
pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs)
@require_torch
@slow
def test_pt_defaults_loads(self):
pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs) pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs)
@require_tf @require_tf
@slow @slow
def test_tf_defaults_loads(self): def test_tf_defaults(self):
pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs) pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs)
@require_torch @require_torch
...@@ -166,6 +106,95 @@ class MonoInputPipelineCommonMixin: ...@@ -166,6 +106,95 @@ class MonoInputPipelineCommonMixin:
) )
self._test_pipeline(nlp) self._test_pipeline(nlp)
def _test_pipeline(self, nlp: Pipeline):
raise NotImplementedError
@require_torch
def test_compare_slow_fast_torch(self):
for model_name in self.small_models:
nlp_slow = pipeline(
task=self.pipeline_task,
model=model_name,
tokenizer=model_name,
framework="pt",
use_fast=False,
**self.pipeline_loading_kwargs,
)
nlp_fast = pipeline(
task=self.pipeline_task,
model=model_name,
tokenizer=model_name,
framework="pt",
use_fast=True,
**self.pipeline_loading_kwargs,
)
self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="forward")
@require_tf
def test_compare_slow_fast_tf(self):
for model_name in self.small_models:
nlp_slow = pipeline(
task=self.pipeline_task,
model=model_name,
tokenizer=model_name,
framework="tf",
use_fast=False,
**self.pipeline_loading_kwargs,
)
nlp_fast = pipeline(
task=self.pipeline_task,
model=model_name,
tokenizer=model_name,
framework="tf",
use_fast=True,
**self.pipeline_loading_kwargs,
)
self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="call")
def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline, method: str):
"""We check that the inputs to the models forward passes are identical for
slow and fast tokenizers.
"""
with mock.patch.object(
nlp_slow.model, method, wraps=getattr(nlp_slow.model, method)
) as mock_slow, mock.patch.object(nlp_fast.model, method, wraps=getattr(nlp_fast.model, method)) as mock_fast:
for inputs in self.valid_inputs:
if isinstance(inputs, dict):
inputs.update(self.pipeline_running_kwargs)
_ = nlp_slow(**inputs)
_ = nlp_fast(**inputs)
else:
_ = nlp_slow(inputs, **self.pipeline_running_kwargs)
_ = nlp_fast(inputs, **self.pipeline_running_kwargs)
mock_slow.assert_called()
mock_fast.assert_called()
self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list))
for mock_slow_call_args, mock_fast_call_args in zip(
mock_slow.call_args_list, mock_fast.call_args_list
):
slow_call_args, slow_call_kwargs = mock_slow_call_args
fast_call_args, fast_call_kwargs = mock_fast_call_args
slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs)
fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs)
self.assertEqual(slow_call_args, fast_call_args)
self.assertDictEqual(slow_call_kwargs, fast_call_kwargs)
@is_pipeline_test
class MonoInputPipelineCommonMixin(CustomInputPipelineCommonMixin):
"""A version of the CustomInputPipelineCommonMixin
with a predefined `_test_pipeline` method.
"""
mandatory_keys = {} # Keys which should be in the output
invalid_inputs = [None] # inputs which are not allowed
expected_multi_result: Optional[List] = None
expected_check_keys: Optional[List[str]] = None
def _test_pipeline(self, nlp: Pipeline): def _test_pipeline(self, nlp: Pipeline):
self.assertIsNotNone(nlp) self.assertIsNotNone(nlp)
...@@ -199,76 +228,3 @@ class MonoInputPipelineCommonMixin: ...@@ -199,76 +228,3 @@ class MonoInputPipelineCommonMixin:
self.assertIn(key, result) self.assertIn(key, result)
self.assertRaises(Exception, nlp, self.invalid_inputs) self.assertRaises(Exception, nlp, self.invalid_inputs)
# @is_pipeline_test
# class DefaultArgumentHandlerTestCase(unittest.TestCase):
# def setUp(self) -> None:
# self.handler = DefaultArgumentHandler()
#
# def test_kwargs_x(self):
# mono_data = {"X": "This is a sample input"}
# mono_args = self.handler(**mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# multi_data = {"x": ["This is a sample input", "This is a second sample input"]}
# multi_args = self.handler(**multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
#
# def test_kwargs_data(self):
# mono_data = {"data": "This is a sample input"}
# mono_args = self.handler(**mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# multi_data = {"data": ["This is a sample input", "This is a second sample input"]}
# multi_args = self.handler(**multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
#
# def test_multi_kwargs(self):
# mono_data = {"data": "This is a sample input", "X": "This is a sample input 2"}
# mono_args = self.handler(**mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 2)
#
# multi_data = {
# "data": ["This is a sample input", "This is a second sample input"],
# "test": ["This is a sample input 2", "This is a second sample input 2"],
# }
# multi_args = self.handler(**multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 4)
#
# def test_args(self):
# mono_data = "This is a sample input"
# mono_args = self.handler(mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# mono_data = ["This is a sample input"]
# mono_args = self.handler(mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# multi_data = ["This is a sample input", "This is a second sample input"]
# multi_args = self.handler(multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
#
# multi_data = ["This is a sample input", "This is a second sample input"]
# multi_args = self.handler(*multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
import unittest
from transformers.pipelines import Conversation, Pipeline
from .test_pipelines_common import CustomInputPipelineCommonMixin
class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
pipeline_task = "conversational"
small_models = [] # Default model - Models tested without the @slow decorator
large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator
def _test_pipeline(self, nlp: Pipeline):
valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]]
invalid_inputs = ["Hi there!", Conversation()]
self.assertIsNotNone(nlp)
mono_result = nlp(valid_inputs[0])
self.assertIsInstance(mono_result, Conversation)
multi_result = nlp(valid_inputs[1])
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], Conversation)
# Inactive conversations passed to the pipeline raise a ValueError
self.assertRaises(ValueError, nlp, valid_inputs[1])
for bad_input in invalid_inputs:
self.assertRaises(Exception, nlp, bad_input)
self.assertRaises(Exception, nlp, invalid_inputs)
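For reference, a hedged usage sketch of the inactive-conversation behaviour exercised above (it downloads microsoft/DialoGPT-medium; only the flow the test relies on is shown, not the exact error message):

from transformers import Conversation, pipeline

nlp = pipeline("conversational", model="microsoft/DialoGPT-medium")

conv = Conversation("Hi there!")
nlp(conv)  # generates a reply; the pending user input is now consumed

# Passing the same conversation again without new user input is expected
# to raise a ValueError (the "inactive conversation" case tested above).
conv.add_user_input("How are you?")
nlp(conv)  # valid again once a new user input has been added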
...@@ -146,10 +146,10 @@ class NerPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
@require_torch
def test_pt_ignore_subwords_slow_tokenizer_raises(self):
for model_name in self.small_models:
- tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
with self.assertRaises(ValueError):
- pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True)
+ pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True, use_fast=False)
@require_torch
def test_pt_defaults_slow_tokenizer(self):
...
...@@ -8,10 +8,22 @@ from .test_pipelines_common import CustomInputPipelineCommonMixin
class QAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
pipeline_task = "question-answering"
pipeline_running_kwargs = {
"padding": "max_length",
"max_seq_len": 25,
"doc_stride": 5,
} # Default is 'longest' but we use 'max_length' to test equivalence between slow/fast tokenizers
small_models = [
"sshleifer/tiny-distilbert-base-cased-distilled-squad"
] # Models tested without the @slow decorator
large_models = [] # Models tested with the @slow decorator
valid_inputs = [
{"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
{
"question": "In what field is HuggingFace working ?",
"context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
},
]
def _test_pipeline(self, nlp: Pipeline):
output_keys = {"score", "answer", "start", "end"}
...
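The `pipeline_running_kwargs` above pin `padding="max_length"` together with a small `max_seq_len` and `doc_stride`, presumably so that slow and fast tokenizers produce encodings of identical shape that can be compared field by field. A minimal sketch of that idea outside the pipeline, using the same checkpoint and question/context as the tests (the `max_length` value is illustrative, and the asserts are only expected to hold for simple inputs like this one):

from transformers import AutoTokenizer

checkpoint = "sshleifer/tiny-distilbert-base-cased-distilled-squad"
question = "Where was HuggingFace founded ?"
context = "HuggingFace was founded in Paris."

slow = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
fast = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

enc_slow = slow(question, context, padding="max_length", max_length=25, truncation=True)
enc_fast = fast(question, context, padding="max_length", max_length=25, truncation=True)

# With a fixed padded length the two encodings line up token for token,
# which is what makes the slow/fast forward-pass comparison meaningful.
assert enc_slow["input_ids"] == enc_fast["input_ids"]
assert enc_slow["attention_mask"] == enc_fast["attention_mask"]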
...@@ -12,6 +12,18 @@ class ZeroShotClassificationPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
"sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english"
] # Models tested without the @slow decorator
large_models = ["roberta-large-mnli"] # Models tested with the @slow decorator
valid_inputs = [
{"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"},
{"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]},
{"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"},
{"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]},
{"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"},
{
"sequences": "Who are you voting for in 2020?",
"candidate_labels": "politics",
"hypothesis_template": "This text is about {}",
},
]
def _test_scores_sum_to_one(self, result):
sum = 0.0
...
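As the `valid_inputs` above illustrate, `candidate_labels` may be a single label, a comma-separated string, or a list, and `hypothesis_template` controls how each label is turned into an NLI hypothesis. A hedged usage sketch with the tiny checkpoint from these tests (its scores are not meaningful, only the output structure):

from transformers import pipeline

classifier = pipeline(
    "zero-shot-classification",
    model="sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english",
)

result = classifier(
    "Who are you voting for in 2020?",
    candidate_labels="politics, public health",
    hypothesis_template="This text is about {}",
)

# The result is a dict with the input sequence, the labels sorted by score,
# and scores that sum to one (the property _test_scores_sum_to_one checks).
print(result["labels"], result["scores"])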
...@@ -9,7 +9,7 @@ from unittest.mock import patch
import numpy as np
from datasets import Dataset
- import faiss
+ from transformers import is_faiss_available
from transformers.configuration_bart import BartConfig
from transformers.configuration_dpr import DPRConfig
from transformers.configuration_rag import RagConfig
...@@ -27,6 +27,10 @@ from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
if is_faiss_available():
import faiss
@require_faiss
@require_datasets
class RagRetrieverTest(TestCase):
...
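The guarded `import faiss` plus the `@require_faiss` decorator is the usual optional-dependency pattern: the test module can be imported even when faiss is missing, and the tests are skipped rather than erroring out. A minimal sketch of the same guard outside the test suite (the `build_index` helper and the toy vectors are invented for illustration):

import numpy as np

from transformers import is_faiss_available

if is_faiss_available():
    import faiss  # only imported when the optional dependency is installed


def build_index(vectors: np.ndarray):
    if not is_faiss_available():
        raise RuntimeError("faiss is not installed; install it to use dense retrieval")
    index = faiss.IndexFlatL2(vectors.shape[1])  # exact L2 search over vectors of this dimension
    index.add(vectors.astype("float32"))
    return index


if is_faiss_available():
    index = build_index(np.random.rand(8, 4))
    scores, ids = index.search(np.random.rand(1, 4).astype("float32"), k=2)
    print(ids)  # indices of the two nearest stored vectors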
...@@ -116,5 +116,5 @@ class AutoTokenizerTest(unittest.TestCase):
@require_tokenizers
def test_from_pretrained_use_fast_toggle(self):
- self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer)
+ self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer)
- self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast)
+ self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast)