Unverified Commit 7898fc03 authored by Sylvain Gugger, committed by GitHub

Add `from_slow` in fast tokenizers build and fixes some bugs (#9987)

parent 6244727e
@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
     The token used for masking values. This is the token used when training this model with masked language
     modeling. This is the token which the model will try to predict.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
     Whether or not to keep accents when tokenizing.
 bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
     The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-    .. note:: When building a sequence using special tokens, this is not the token that is used for the
-    beginning of sequence. The token used is the :obj:`cls_token`.
+    .. note::
+        When building a sequence using special tokens, this is not the token that is used for the beginning of
+        sequence. The token used is the :obj:`cls_token`.
 eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
     The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
     that is used for the end of sequence. The token used is the :obj:`sep_token`.
@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
     instead of per-token classification). It is the first token of the sequence when built with special tokens.
 mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
     The token used for masking values. This is the token used when training this model with masked language
-    modeling. This is the token which the model will try to predict. Attributes:
-    sp_model (:obj:`SentencePieceProcessor`):
-        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+    modeling. This is the token which the model will try to predict.
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
     HerbertTokenizerFast,
     PhobertTokenizer,
     BarthezTokenizer,
+    BarthezTokenizerFast,
 ]
...
@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
 r"""
 Construct a BART tokenizer.
-:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
-:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
-
-Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
-initialization parameters and other methods.
+:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
+:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
+parameters and other methods.
 """
 # merges and vocab same as Roberta
 max_model_input_sizes = {m: 1024 for m in _all_bart_models}
...
@@ -37,6 +37,13 @@ _all_bart_models = [
 class BartTokenizerFast(RobertaTokenizerFast):
+    r"""
+    Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
+    superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
+    initialization parameters and other methods.
+    """
     # merges and vocab same as Roberta
     max_model_input_sizes = {m: 1024 for m in _all_bart_models}
     pretrained_vocab_files_map = {
...
@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
     modeling. This is the token which the model will try to predict.
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
 Construct a Blenderbot tokenizer.
 :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
-end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
+end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
 to the beginning of sequences.
 Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
...
@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
     modeling. This is the token which the model will try to predict.
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes:
-    sp_model (:obj:`SentencePieceProcessor`):
-        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
     <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
 additional_special_tokens (:obj:`List[str]`, `optional`):
     Additional special tokens used by the tokenizer.
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
+Attributes:
+    sp_model (:obj:`SentencePieceProcessor`):
+        The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
     modeling. This is the token which the model will try to predict.
 additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
     Additional special tokens used by the tokenizer.
-Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-conversion (string, tokens and IDs).
 """
 vocab_files_names = VOCAB_FILES_NAMES
...
@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     def _from_pretrained(
         cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
     ):
-        # We instantiate fast tokenizers based on a slow tokenizer for now
-        # In the future we can also use a direct way based on saving/instantiating
-        # tokenizer's Tokenizer directly from it's serialization JSON
-        if (
-            "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
-        ) and cls.slow_tokenizer_class is not None:
+        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
+        # file or if `from_slow` is set to True.
+        from_slow = kwargs.get("from_slow", False)
+        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
+        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
             slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                 copy.deepcopy(resolved_vocab_files),
                 pretrained_model_name_or_path,
...
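The new branch above reads as a single predicate: convert from the slow tokenizer whenever `from_slow` is requested or no `tokenizer.json` was resolved, provided a slow tokenizer class is available. A minimal standalone sketch of that predicate for reference (the helper name is hypothetical, not a transformers function):

```python
# Illustrative restatement of the condition added in _from_pretrained above;
# `needs_conversion_from_slow` is a hypothetical name, not library code.
def needs_conversion_from_slow(resolved_vocab_files, from_slow, slow_tokenizer_class):
    has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
    return (from_slow or not has_tokenizer_file) and slow_tokenizer_class is not None


# No tokenizer.json resolved -> build the fast backend from the slow tokenizer.
assert needs_conversion_from_slow({"vocab_file": "spiece.model"}, False, object)
# tokenizer.json present and from_slow not requested -> load it directly.
assert not needs_conversion_from_slow({"tokenizer_file": "tokenizer.json"}, False, object)
# from_slow=True overrides an existing tokenizer.json.
assert needs_conversion_from_slow({"tokenizer_file": "tokenizer.json"}, True, object)
```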
@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     def __init__(self, *args, **kwargs):
         slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
         fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
+        from_slow = kwargs.pop("from_slow", False)

-        if fast_tokenizer_file is not None:
+        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+            raise ValueError(
+                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+                "have sentencepiece installed."
+            )
+
+        if fast_tokenizer_file is not None and not from_slow:
             # We have a serialization from tokenizers which let us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
         elif slow_tokenizer is not None:
...
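End to end, the new flag is simply forwarded through the `from_pretrained` kwargs. A minimal usage sketch, assuming the `albert-base-v2` checkpoint and that both `tokenizers` and `sentencepiece` are installed (the checkpoint name is an illustration, not part of this commit):

```python
from transformers import AlbertTokenizerFast

# Default: reuse the checkpoint's tokenizer.json when one is available.
fast = AlbertTokenizerFast.from_pretrained("albert-base-v2")

# New in this commit: from_slow=True forces the fast backend to be rebuilt
# from the slow (SentencePiece) vocabulary files, even if tokenizer.json exists.
rebuilt = AlbertTokenizerFast.from_pretrained("albert-base-v2", from_slow=True)
```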