Unverified Commit 7898fc03, authored by Sylvain Gugger, committed by GitHub

Add `from_slow` in fast tokenizers build and fixes some bugs (#9987)

parent 6244727e
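This commit threads a new `from_slow` keyword through the fast-tokenizer loading path, so a fast tokenizer can be rebuilt from its slow (e.g. SentencePiece-based) counterpart even when a serialized `tokenizer.json` is available. A minimal usage sketch of the new flag; the `albert-base-v2` checkpoint name is only an illustration, not part of this diff:

```python
from transformers import AlbertTokenizerFast

# Default path: reuse the serialized tokenizer.json shipped with the checkpoint when available.
fast = AlbertTokenizerFast.from_pretrained("albert-base-v2")

# New in this commit: force a rebuild from the slow, SentencePiece-based tokenizer,
# ignoring any existing tokenizer.json (requires `sentencepiece` to be installed).
rebuilt = AlbertTokenizerFast.from_pretrained("albert-base-v2", from_slow=True)
```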
......@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
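The `sp_model` attribute documented in this hunk is the underlying `sentencepiece.SentencePieceProcessor`. A short illustrative sketch of what it exposes; the checkpoint name and example string are assumptions, not part of the diff:

```python
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

# The slow tokenizer delegates every string <-> piece <-> id conversion to SentencePiece.
pieces = tokenizer.sp_model.encode_as_pieces("hello world")          # list of subword pieces
ids = [tokenizer.sp_model.piece_to_id(piece) for piece in pieces]    # piece -> vocabulary id
text = tokenizer.sp_model.decode_pieces(pieces)                      # pieces -> string
```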
......@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-.. note:: When building a sequence using special tokens, this is not the token that is used for the
-beginning of sequence. The token used is the :obj:`cls_token`.
+.. note::
+    When building a sequence using special tokens, this is not the token that is used for the beginning of
+    sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
that is used for the end of sequence. The token used is the :obj:`sep_token`.
......@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
-modeling. This is the token which the model will try to predict. Attributes:
-sp_model (:obj:`SentencePieceProcessor`):
-    The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+modeling. This is the token which the model will try to predict.
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
HerbertTokenizerFast,
PhobertTokenizer,
BarthezTokenizer,
+BarthezTokenizerFast,
]
......
......@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
r"""
Construct a BART tokenizer.
-:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
-:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
-Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
-initialization parameters and other methods.
+:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
+:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
+parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
......
......@@ -37,6 +37,13 @@ _all_bart_models = [
class BartTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
+:class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
+superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
+initialization parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {
......
......@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
Construct a Blenderbot tokenizer.
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
-end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
+end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
to the beginning of sequences.
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
......
......@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes:
-    sp_model (:obj:`SentencePieceProcessor`):
-        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     def _from_pretrained(
         cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
     ):
-        # We instantiate fast tokenizers based on a slow tokenizer for now
-        # In the future we can also use a direct way based on saving/instantiating
-        # tokenizer's Tokenizer directly from it's serialization JSON
-        if (
-            "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
-        ) and cls.slow_tokenizer_class is not None:
+        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
+        # file or if `from_slow` is set to True.
+        from_slow = kwargs.get("from_slow", False)
+        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
+        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
             slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                 copy.deepcopy(resolved_vocab_files),
                 pretrained_model_name_or_path,
......
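The rewritten condition above boils down to: build from the slow tokenizer whenever a slow class exists and either `from_slow` was requested or no `tokenizer.json` was resolved. A standalone restatement of that logic; the helper function is illustrative, not part of the commit:

```python
def should_build_from_slow(resolved_vocab_files: dict, from_slow: bool, has_slow_class: bool) -> bool:
    """Mirrors the new condition in `_from_pretrained`."""
    has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
    return (from_slow or not has_tokenizer_file) and has_slow_class


# The three situations the new code distinguishes:
assert should_build_from_slow({}, from_slow=False, has_slow_class=True)            # no tokenizer.json -> convert
assert not should_build_from_slow({"tokenizer_file": "tok.json"}, False, True)     # tokenizer.json found -> load it
assert should_build_from_slow({"tokenizer_file": "tok.json"}, True, True)          # from_slow forces conversion
```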
......@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     def __init__(self, *args, **kwargs):
         slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
         fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
+        from_slow = kwargs.pop("from_slow", False)

-        if fast_tokenizer_file is not None:
+        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+            raise ValueError(
+                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+                "have sentencepiece installed."
+            )
+
+        if fast_tokenizer_file is not None and not from_slow:
             # We have a serialization from tokenizers which let us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
         elif slow_tokenizer is not None:
......
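With this change, `PreTrainedTokenizerFast.__init__` picks its backend in a fixed order: a serialized `tokenizer_file` (unless `from_slow` is set), then a slow tokenizer passed in for conversion, then the slow tokenizer class as a last resort. A condensed, illustrative restatement of that dispatch; the error message is copied from the diff, but the function itself is only a sketch, not the library's API:

```python
def pick_backend(fast_tokenizer_file, slow_tokenizer, slow_tokenizer_class, from_slow=False):
    """Sketch of the backend selection order after this commit."""
    if from_slow and slow_tokenizer is None and slow_tokenizer_class is None:
        raise ValueError(
            "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, "
            "make sure you have sentencepiece installed."
        )
    if fast_tokenizer_file is not None and not from_slow:
        return "load serialized tokenizer.json"        # TokenizerFast.from_file(...)
    if slow_tokenizer is not None:
        return "convert the provided slow tokenizer"   # conversion from the slow instance
    if slow_tokenizer_class is not None:
        return "instantiate the slow class, then convert it"
    raise ValueError("Cannot build a backend tokenizer from the given inputs.")  # placeholder message
```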