"csrc/quantization/git@developer.sourcefind.cn:norm/vllm.git" did not exist on "2b1c116b5acdf3b738e310f98617875132214c37"
Unverified Commit 9aeacb58 authored by Thomas Wolf, committed by GitHub

Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)

* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding gpt - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4 switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dumping the fast version of transformer XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickle Bert JP with Mecab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
parent 4d04120c
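
A minimal usage sketch (not part of the diff; the model name and flags below are only illustrative assumptions) of what this change enables: SentencePiece-based models now get a Rust-backed fast tokenizer alongside the existing slow one.

from transformers import AutoTokenizer

# use_fast=True should now resolve to a *Fast class (e.g. T5TokenizerFast)
# for SentencePiece-based models; requires the sentencepiece and protobuf deps.
fast_tok = AutoTokenizer.from_pretrained("t5-small", use_fast=True)
slow_tok = AutoTokenizer.from_pretrained("t5-small", use_fast=False)

text = "translate English to German: Hello world"
# both tokenizers are expected to produce the same ids for plain text
print(fast_tok(text)["input_ids"] == slow_tok(text)["input_ids"])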
......@@ -65,3 +65,4 @@ class MobileBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer
......@@ -19,8 +19,6 @@ import json
import os
import re
from tokenizers import CharBPETokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
......@@ -123,6 +121,10 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
@property
def do_lower_case(self):
return True
@property
def vocab_size(self):
return len(self.encoder)
......@@ -243,9 +245,8 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
the following peculiarities:
- lowercases all inputs,
- uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
:obj:`BasicTokenizer` if not.
- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
......@@ -264,10 +265,11 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = OpenAIGPTTokenizer
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
kwargs.setdefault("unk_token", unk_token)
super().__init__(
CharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, lowercase=True),
**kwargs,
)
super().__init__(vocab_file, merges_file, unk_token=unk_token, **kwargs)
@property
def do_lower_case(self):
return True
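
A short usage sketch (not part of the diff; assumes the openai-gpt checkpoint is available) for the refactored fast GPT tokenizer, which now exposes do_lower_case like its slow counterpart:

from transformers import OpenAIGPTTokenizerFast

tok = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")
print(tok.do_lower_case)            # True, via the property added above
print(tok.tokenize("Hello World"))  # inputs are lowercased before BPE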
......@@ -15,10 +15,23 @@
from typing import Dict, List, Optional
from .file_utils import add_start_docstrings
from .tokenization_reformer import ReformerTokenizer
from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/pegasus-xsum": 512,
}
class PegasusTokenizer(ReformerTokenizer):
r"""
Construct a Pegasus tokenizer.
......@@ -31,6 +44,8 @@ class PegasusTokenizer(ReformerTokenizer):
"""
offset = 103 # entries 2-104 are only used for pretraining
vocab_files_names = {"vocab_file": "spiece.model"}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
......@@ -150,3 +165,85 @@ class PegasusTokenizer(ReformerTokenizer):
# for k, v in decoder_inputs.items():
# model_inputs[f"decoder_{k}"] = v
return model_inputs
class PegasusTokenizerFast(ReformerTokenizerFast):
offset = 103 # entries 2-104 are only used for pretraining
vocab_files_names = {"vocab_file": "spiece.model"}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = PegasusTokenizer
# def num_special_tokens_to_add(self, pair=False):
# """Just EOS"""
# return 1
def _special_token_mask(self, seq):
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
assert all_special_ids == set([0, 1])
return [1 if x in all_special_ids else 0 for x in seq]
def get_special_tokens_mask(
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""Get list where entries are [1] if a token is [eos] or [pad] else 0."""
if already_has_special_tokens:
return self._special_token_mask(token_ids_0)
elif token_ids_1 is None:
return self._special_token_mask(token_ids_0) + [1]
else:
return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""
Build model inputs from a sequence by adding eos to the end. No bos token is added to the front.
- single sequence: ``X </s>``
- pair of sequences: ``A B </s>`` (not intended use)
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
# We don't expect to process pairs, but leave the pair logic for API consistency
return token_ids_0 + token_ids_1 + [self.eos_token_id]
@add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
return_tensors: str = "pt",
truncation=True,
padding="longest",
**unused,
) -> BatchEncoding:
if "" in src_texts:
raise ValueError(f"found empty string in src_texts: {src_texts}")
tokenizer_kwargs = dict(
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
truncation=truncation,
padding=padding,
)
model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs)
if tgt_texts is None:
return model_inputs
if max_target_length is not None:
tokenizer_kwargs["max_length"] = max_target_length
# TODO(@sshleifer): maybe tgt_texts = [self.pad_token + t for t in tgt_texts] # add decoder_start_token_id
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
model_inputs["labels"] = labels
# for k, v in decoder_inputs.items():
# model_inputs[f"decoder_{k}"] = v
return model_inputs
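
An illustrative sketch (not part of the diff; assumes the google/pegasus-xsum checkpoint and torch are available) of the new PegasusTokenizerFast: only </s> is appended, no BOS, matching build_inputs_with_special_tokens above.

from transformers import PegasusTokenizerFast

tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")
ids = tok("A short article to summarize.")["input_ids"]
print(ids[-1] == tok.eos_token_id)  # True: eos appended at the end

batch = tok.prepare_seq2seq_batch(
    src_texts=["A short article to summarize."],
    tgt_texts=["A summary."],
    return_tensors="pt",
)
print(sorted(batch.keys()))  # ['attention_mask', 'input_ids', 'labels']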
......@@ -126,7 +126,6 @@ class PhobertTokenizer(PreTrainedTokenizer):
**kwargs
):
super().__init__(
max_len=256,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
......
......@@ -19,6 +19,7 @@ import os
from shutil import copyfile
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -184,3 +185,72 @@ class ReformerTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class ReformerTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
<https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = ReformerTokenizer
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
additional_special_tokens=[],
**kwargs
):
super().__init__(
vocab_file,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
def save_vocabulary(self, save_directory):
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
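
A quick parity sketch (not part of the diff; the checkpoint name is an assumption) showing that the new fast Reformer tokenizer is intended as a drop-in for the slow SentencePiece one:

from transformers import ReformerTokenizer, ReformerTokenizerFast

name = "google/reformer-crime-and-punishment"
slow = ReformerTokenizer.from_pretrained(name)
fast = ReformerTokenizerFast.from_pretrained(name)

text = "A SentencePiece-backed fast tokenizer."
print(slow(text)["input_ids"] == fast(text)["input_ids"])  # expected: True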
......@@ -71,4 +71,5 @@ class RetriBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = RetriBertTokenizer
model_input_names = ["attention_mask"]
......@@ -17,8 +17,6 @@
import warnings
from typing import List, Optional
from tokenizers.processors import RobertaProcessing
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_utils import AddedToken
from .utils import logging
......@@ -344,6 +342,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = RobertaTokenizer
def __init__(
self,
......@@ -358,38 +357,23 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
trim_offsets=True,
**kwargs
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
kwargs.setdefault("pad_token", pad_token)
kwargs.setdefault("sep_token", sep_token)
kwargs.setdefault("cls_token", cls_token)
kwargs.setdefault("mask_token", mask_token)
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
unk_token=unk_token,
vocab_file,
merges_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
**kwargs,
)
# This will add the necessary special tokens to the vocabulary if needed
self.sanitize_special_tokens()
self.backend_tokenizer._tokenizer.post_processor = RobertaProcessing(
sep=(sep_token, self.sep_token_id),
cls=(cls_token, self.cls_token_id),
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
......
......@@ -24,6 +24,7 @@ from typing import List, Optional
from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -322,3 +323,161 @@ class T5Tokenizer(PreTrainedTokenizer):
)
model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
return model_inputs
class T5TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
<https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`int`, `optional`, defaults to 100):
Add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token
in the vocabulary like in T5 preprocessing see `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = T5Tokenizer
prefix_tokens: List[int] = []
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
**kwargs
):
super().__init__(
vocab_file,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A sequence has the following format:
- single sequence: ``X </s>``
- pair of sequences: ``A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
token_ids_0 = token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0
else:
token_ids_1 = token_ids_1 + [self.eos_token_id]
return self.prefix_tokens + token_ids_0 + token_ids_1
@add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = None,
truncation: bool = True,
**kwargs,
) -> BatchEncoding:
if max_length is None:
max_length = self.max_len
self.prefix_tokens = []
model_inputs = self(
src_texts,
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
padding=padding,
truncation=truncation,
**kwargs,
)
if tgt_texts is None:
return model_inputs
# Process tgt_texts
if max_target_length is None:
max_target_length = max_length
# set prefix_tokens for target text
self.prefix_tokens = [self.pad_token_id]
labels_and_decoder_mask = self(
tgt_texts,
add_special_tokens=True,
return_tensors=return_tensors,
padding=padding,
max_length=max_target_length,
truncation=truncation,
**kwargs,
)
model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
self.prefix_tokens = []
return model_inputs
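
An illustrative sketch (not part of the diff; assumes the t5-small checkpoint and torch are available) of prepare_seq2seq_batch on the new fast T5 tokenizer:

from transformers import T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-small")
batch = tok.prepare_seq2seq_batch(
    src_texts=["translate English to German: The house is wonderful."],
    tgt_texts=["Das Haus ist wunderbar."],
    return_tensors="pt",  # omit to get plain python lists
)
print(sorted(batch.keys()))  # ['attention_mask', 'input_ids', 'labels']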
......@@ -22,23 +22,15 @@ import glob
import os
import pickle
import re
import warnings
from collections import Counter, OrderedDict
from typing import List, Optional
from typing import List
import numpy as np
import sacremoses as sm
from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str
from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit
from tokenizers.processors import BertProcessing
from .file_utils import cached_path, is_torch_available, torch_only_method
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -53,7 +45,6 @@ VOCAB_FILES_NAMES = {
"pretrained_vocab_file_torch": "vocab.bin",
"vocab_file": "vocab.txt",
}
VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"pretrained_vocab_file": {
......@@ -61,12 +52,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
}
PRETRAINED_VOCAB_FILES_MAP_FAST = {
"pretrained_vocab_file": {
"transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"transfo-xl-wt103": None,
}
......@@ -240,6 +225,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if vocab_file is not None:
self.build_vocab()
@property
def do_lower_case(self):
return self.lower_case
def _compile_space_around_punctuation_pattern(self):
look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols)
look_ahead_to_match_all_except_space = r"(?=[^\s])"
......@@ -299,11 +288,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
:obj:`Tuple(str)`: Paths to the files saved.
"""
logger.warning(
"Please note you will not be able to load the save vocabulary in"
" Rust-based TransfoXLTokenizerFast as they don't share the same structure."
)
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
else:
......@@ -492,165 +476,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return symbols
class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
def __init__(
self,
vocab_file,
delimiter,
lowercase,
unk_token,
eos_token,
add_eos=False,
add_double_eos=False,
normalization: Optional[str] = None,
):
try:
tokenizer = WordLevel(vocab_file, unk_token=unk_token)
tokenizer = Tokenizer(tokenizer)
except Exception:
raise ValueError(
"Unable to parse file {}. Unknown format. "
"If you tried to load a model saved through TransfoXLTokenizer,"
"please note they are not compatible.".format(vocab_file)
)
# Create the correct normalization path
normalizer = []
# Include unicode normalization
if normalization:
normalizer += [unicode_normalizer_from_str(normalization)]
# Include case normalization
if lowercase:
normalizer += [Lowercase()]
# Strip normalizer at the end
normalizer += [Strip(left=True, right=True)]
if len(normalizer) > 0:
tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]
# Setup the splitter
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()
if add_double_eos:
tokenizer.post_processor = BertProcessing(
(eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token))
)
parameters = {
"model": "TransfoXLModel",
"add_eos": add_eos,
"add_double_eos": add_double_eos,
"unk_token": unk_token,
"eos_token": eos_token,
"delimiter": delimiter,
"lowercase": lowercase,
}
super().__init__(tokenizer, parameters)
class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library) adapted from Vocab class
in `the original code <https://github.com/kimiyoung/transformer-xl>`__. The Transformer-XL tokenizer is a
word-level tokenizer (no sub-word tokenization).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
special (:obj:`List[str]`, `optional`):
A list of special tokens (to be treated by the original implementation of this tokenizer).
min_freq (:obj:`int`, `optional`, defaults to 0):
The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
will be mapped to :obj:`unk_token`).
max_size (:obj:`int`, `optional`):
The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
after excluding the tokens according to the :obj:`min_freq` rule.
lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to lowercase the input when tokenizing.
delimiter (:obj:`str`, `optional`):
The delimiter used between tokens.
vocab_file (:obj:`str`, `optional`):
File containing the vocabulary (from the original implementation).
pretrained_vocab_file (:obj:`str`, `optional`):
File containing the vocabulary as saved with the :obj:`save_pretrained()` method.
never_split (xxx, `optional`):
Fill me with interesting stuff.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"<eos>"`):
The end of sequence token.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<formula>"]`):
A list of additional special tokens (for the HuggingFace functionality).
add_eos (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add the end-of-sentence token.
add_double_eos (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add the end-of-sentence token.
normalization (xxx, `optional`):
Fill me with interesting stuff.
"""
vocab_files_names = VOCAB_FILES_NAMES_FAST
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = []
def __init__(
self,
special=None,
min_freq=0,
max_size=None,
lower_case=False,
delimiter=None,
vocab_file=None,
pretrained_vocab_file=None,
never_split=None,
unk_token="<unk>",
eos_token="<eos>",
additional_special_tokens=["<formula>"],
add_eos=False,
add_double_eos=False,
normalization=None,
**kwargs
):
super().__init__(
_TransfoXLDelimiterLookupTokenizer(
vocab_file=vocab_file or pretrained_vocab_file,
delimiter=delimiter,
lowercase=lower_case,
unk_token=unk_token,
eos_token=eos_token,
add_eos=add_eos,
add_double_eos=add_double_eos,
normalization=normalization,
),
unk_token=unk_token,
eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
warnings.warn(
"The class `TransfoXLTokenizerFast` is deprecated and will be removed in a future version. Please use `TransfoXLTokenizer` with it's enhanced tokenization instead.",
FutureWarning,
)
def save_pretrained(self, save_directory):
logger.warning(
"Please note you will not be able to load the vocabulary in"
" Python-based TransfoXLTokenizer as they don't share the same structure."
)
return super().save_pretrained(save_directory)
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
......
......@@ -15,7 +15,6 @@
""" Tokenization classes for python tokenizers.
For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
"""
import itertools
import re
import unicodedata
......@@ -45,6 +44,11 @@ from .utils import logging
logger = logging.get_logger(__name__)
# Slow tokenizers are saved in a vocabulary plus three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
def _is_whitespace(char):
"""Checks whether `char` is a whitespace character."""
......@@ -190,7 +194,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
tokens_to_add = []
for token in new_tokens:
assert isinstance(token, str)
if not special_tokens and self.init_kwargs.get("do_lower_case", False):
if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
token = token.lower()
if (
token != self.unk_token
......@@ -239,6 +243,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
"""
Converts a string in a sequence of tokens, using the tokenizer.
Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method
won't replace the unknown tokens with the `unk_token` yet (this is done in the `encode()` method)
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
Takes care of added tokens.
......@@ -268,7 +275,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
logger.warning(f"Keyword arguments {kwargs} not recognized.")
# TODO: should this be in the base class?
if self.init_kwargs.get("do_lower_case", False):
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
......@@ -740,7 +747,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
return " ".join(tokens)
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
spaces_between_special_tokens: bool = True,
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
......@@ -755,6 +766,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean up the tokenization spaces.
spaces_between_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add spaces around special tokens.
Fast tokenizers default this to :obj:`False`.
This is set to :obj:`True` in slow tokenizers for backward compatibility.
Returns:
:obj:`str`: The decoded sentence.
......@@ -778,7 +793,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = " ".join(sub_texts)
if spaces_between_special_tokens:
text = " ".join(sub_texts)
else:
text = "".join(sub_texts)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
......
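
An illustrative sketch (not part of the diff; the added token is a made-up example and the exact spacing of the outputs may differ) of the new spaces_between_special_tokens flag on slow tokenizers:

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_tokens(["<new_tok>"])  # hypothetical added token
ids = tok.encode("hello <new_tok> world")

# default True: sub-strings around added/special tokens are joined with spaces
print(tok.decode(ids, spaces_between_special_tokens=True))
# False: joined without extra spaces, matching the fast tokenizers' behaviour
print(tok.decode(ids, spaces_between_special_tokens=False))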
......@@ -646,6 +646,8 @@ class SpecialTokensMixin:
# which are not yet in the vocabulary. Necessary for serialization/de-serialization
# TODO clean this up at some point (probably by switching to fast tokenizers)
for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
......@@ -778,6 +780,9 @@ class SpecialTokensMixin:
return self._add_tokens(new_tokens, special_tokens=special_tokens)
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
raise NotImplementedError
@property
def bos_token(self) -> str:
"""
......@@ -1293,11 +1298,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_model_input_sizes: Dict[str, Optional[int]] = {}
model_input_names: List[str] = ["token_type_ids", "attention_mask"]
padding_side: str = "right"
slow_tokenizer_class = None
def __init__(self, **kwargs):
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = kwargs
self.init_kwargs = copy.deepcopy(kwargs)
# For backward compatibility we fallback to set model_max_length from max_len if provided
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
......@@ -1311,6 +1317,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
self.deprecation_warnings = (
{}
) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
super().__init__(**kwargs)
@property
......@@ -1343,9 +1353,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def max_len_single_sentence(self, value) -> int:
# For backward compatibility, allow to try to setup 'max_len_single_sentence'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
)
if not self.deprecation_warnings.get("max_len_single_sentence", False):
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
)
self.deprecation_warnings["max_len_single_sentence"] = True
else:
raise ValueError(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
......@@ -1355,16 +1367,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def max_len_sentences_pair(self, value) -> int:
# For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
if not self.deprecation_warnings.get("max_len_sentences_pair", False):
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
self.deprecation_warnings["max_len_sentences_pair"] = True
else:
raise ValueError(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
r"""
Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
a predefined tokenizer.
......@@ -1425,10 +1439,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
assert tokenizer.unk_token == '<unk>'
"""
return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
......@@ -1475,7 +1485,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"added_tokens_file": ADDED_TOKENS_FILE,
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
"full_tokenizer_file": FULL_TOKENIZER_FILE,
"tokenizer_file": FULL_TOKENIZER_FILE,
}
# Look for the tokenizer files
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
......@@ -1541,6 +1551,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
else:
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
return cls._from_pretrained(
resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
)
@classmethod
def _from_pretrained(
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we can also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from its serialization JSON
if cls.slow_tokenizer_class is not None:
slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
copy.deepcopy(init_configuration),
*init_inputs,
**(copy.deepcopy(kwargs)),
)
else:
slow_tokenizer = None
# Prepare tokenizer initialization kwargs
# Did we saved some inputs and kwargs to reload ?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
......@@ -1556,6 +1588,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Convert AddedTokens serialized as dict to class instances
def convert_added_tokens(obj: Union[AddedToken, Any]):
if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
obj.pop("__type")
return AddedToken(**obj)
elif isinstance(obj, (list, tuple)):
return list(convert_added_tokens(o) for o in obj)
elif isinstance(obj, dict):
return {k: convert_added_tokens(v) for k, v in obj.items()}
return obj
init_kwargs = convert_added_tokens(init_kwargs)
# Set max length if needed
if pretrained_model_name_or_path in cls.max_model_input_sizes:
# if we're using a pretrained model, ensure the tokenizer
......@@ -1570,6 +1615,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
# Instantiate tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
......@@ -1580,8 +1628,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# Removed: Now done at the base class level
# tokenizer.init_inputs = init_inputs
# tokenizer.init_kwargs = init_kwargs
# If there is a complementary special token map, load it
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
......@@ -1589,11 +1638,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
special_tokens_map = convert_added_tokens(special_tokens_map)
for key, value in special_tokens_map.items():
if isinstance(value, dict):
value = AddedToken(**value)
elif isinstance(value, list):
value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
setattr(tokenizer, key, value)
# Add supplementary tokens.
......@@ -1623,14 +1669,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def save_pretrained(self, save_directory: str) -> Tuple[str]:
"""
Save the tokenizer vocabulary files together with:
Save the full tokenizer state.
- added tokens,
- special tokens to class attributes mapping,
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
This method makes sure the full tokenizer can then be re-loaded using the
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
.. Note::
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
this method will not be possible to load back
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
.. Warning::
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
......@@ -1648,7 +1697,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
os.makedirs(save_directory, exist_ok=True)
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
......@@ -1657,22 +1705,33 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
# Sanitize AddedTokens
def convert_added_tokens(obj: Union[AddedToken, Any]):
if isinstance(obj, AddedToken):
out = obj.__getstate__()
out["__type"] = "AddedToken"
return out
elif isinstance(obj, (list, tuple)):
return list(convert_added_tokens(o) for o in obj)
elif isinstance(obj, dict):
return {k: convert_added_tokens(v) for k, v in obj.items()}
return obj
tokenizer_config = convert_added_tokens(tokenizer_config)
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
# Sanitize AddedTokens in special_tokens_map
write_dict = convert_added_tokens(self.special_tokens_map_extended)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
write_dict = {}
for key, value in self.special_tokens_map_extended.items():
if isinstance(value, AddedToken):
write_dict[key] = value.__getstate__()
elif isinstance(value, list):
write_dict[key] = [
token.__getstate__() if isinstance(token, AddedToken) else token for token in value
]
else:
write_dict[key] = value
f.write(json.dumps(write_dict, ensure_ascii=False))
file_names = (tokenizer_config_file, special_tokens_map_file)
return self._save_pretrained(save_directory, file_names)
def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]:
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
added_vocab = self.get_added_vocab()
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
......@@ -1681,7 +1740,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
vocab_files = self.save_vocabulary(save_directory)
return vocab_files + (special_tokens_map_file, added_tokens_file)
return file_names + (vocab_files, added_tokens_file)
@add_end_docstrings(
ENCODE_KWARGS_DOCSTRING,
......@@ -1752,13 +1811,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# If you only set max_length, it activates truncation for max_length
if max_length is not None and padding is False and truncation is False:
if verbose:
logger.warning(
"Truncation was not explicitely activated but `max_length` is provided a specific value, "
"please use `truncation=True` to explicitely truncate examples to max length. "
"Defaulting to 'longest_first' truncation strategy. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
"more precisely by providing a specific strategy to `truncation`."
)
if not self.deprecation_warnings.get("Truncation-not-explicitely-activated", False):
logger.warning(
"Truncation was not explicitely activated but `max_length` is provided a specific value, "
"please use `truncation=True` to explicitely truncate examples to max length. "
"Defaulting to 'longest_first' truncation strategy. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
"more precisely by providing a specific strategy to `truncation`."
)
self.deprecation_warnings["Truncation-not-explicitely-activated"] = True
truncation = "longest_first"
# Get padding strategy
......@@ -1818,10 +1879,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if padding_strategy == PaddingStrategy.MAX_LENGTH:
if self.model_max_length > LARGE_INTEGER:
if verbose:
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no padding."
)
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no padding."
)
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
padding_strategy = PaddingStrategy.DO_NOT_PAD
else:
max_length = self.model_max_length
......@@ -1829,10 +1892,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
if self.model_max_length > LARGE_INTEGER:
if verbose:
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no truncation."
)
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
"Default to no truncation."
)
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
else:
max_length = self.model_max_length
......@@ -2437,6 +2502,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
if return_token_type_ids is not None and not add_special_tokens:
raise ValueError(
"Asking to return token_type_ids while setting add_special_tokens to False "
"results in an undefined behavior. Please set add_special_tokens to True or "
"set return_token_type_ids to None."
)
# Load from model defaults
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
......@@ -2469,7 +2541,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
else:
sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
# Build output dictionary
encoded_inputs["input_ids"] = sequence
......@@ -2483,11 +2555,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Check lengths
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
)
if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
)
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
# Padding
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
......@@ -2703,7 +2777,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
]
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
......
......@@ -16,16 +16,19 @@
For slow (python) tokenizers see tokenization_utils.py
"""
import copy
import os
import warnings
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast
from .convert_slow_tokenizer import convert_slow_tokenizer
from .file_utils import add_end_docstrings
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
INIT_TOKENIZER_DOCSTRING,
AddedToken,
......@@ -44,6 +47,15 @@ from .utils import logging
logger = logging.get_logger(__name__)
# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
# Slow tokenizers have an additional added tokens file
ADDED_TOKENS_FILE = "added_tokens.json"
@add_end_docstrings(
INIT_TOKENIZER_DOCSTRING,
"""
......@@ -64,12 +76,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
dictionary structures (BPE, sentencepiece...).
"""
def __init__(self, tokenizer: BaseTokenizerFast, **kwargs):
if not isinstance(tokenizer, BaseTokenizerFast):
raise ValueError(
"Tokenizer should be an instance of a BaseTokenizer " "provided by HuggingFace tokenizers library."
)
self._tokenizer: BaseTokenizerFast = tokenizer
slow_tokenizer_class: PreTrainedTokenizer = None
def __init__(self, *args, **kwargs):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we'll also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from its serialization JSON
if "__slow_tokenizer" in kwargs and kwargs["__slow_tokenizer"]:
slow_tokenizer = kwargs.pop("__slow_tokenizer")
else:
slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
self._tokenizer = convert_slow_tokenizer(slow_tokenizer)
kwargs = copy.deepcopy(slow_tokenizer.init_kwargs)
# We call this after having initialized the backend tokenizer because we update it.
super().__init__(**kwargs)
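
A rough sketch (not part of the diff; assumes sentencepiece and protobuf are installed) of what the constructor above does when no __slow_tokenizer is passed: instantiate the slow tokenizer, then convert it into a Rust-backed tokenizers.Tokenizer.

from transformers import T5Tokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = T5Tokenizer.from_pretrained("t5-small")
backend = convert_slow_tokenizer(slow)  # a tokenizers.Tokenizer instance
print(type(backend))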
......@@ -116,7 +135,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return self._tokenizer.get_vocab_size(with_added_tokens=True)
@property
def backend_tokenizer(self) -> BaseTokenizerFast:
def backend_tokenizer(self) -> TokenizerFast:
"""
:obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
"""
......@@ -259,6 +278,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
"""
Converts a string in a sequence of tokens, using the backend Rust tokenizer.
Note that, unlike slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
will replace the unknown tokens with the :obj:`unk_token`.
Args:
text (:obj:`str`):
The sequence to be encoded.
......@@ -343,7 +365,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
) -> BatchEncoding:
if not isinstance(batch_text_or_text_pairs, list):
raise ValueError(
raise TypeError(
"batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
)
......@@ -487,7 +509,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return batched_output
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
) -> str:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary
......@@ -496,7 +522,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
token_ids (:obj:`List[int]`):
token_ids (:obj:`Union[int, List[int]]`):
List of tokenized input ids. Can be obtained using the ``__call__`` method.
skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to remove special tokens in the decoding.
......@@ -506,6 +532,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Returns:
:obj:`str`: The decoded sentence.
"""
if isinstance(token_ids, int):
token_ids = [token_ids]
text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
if clean_up_tokenization_spaces:
......@@ -520,8 +548,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
and special token mappings.
.. warning::
Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
Please use :meth:`~transformers.PreTrainedTokenizerFast.save_pretrained` to save the full tokenizer state if
you want to reload it using the :meth:`~transformers.PreTrainedTokenizerFast.from_pretrained` class method.
Args:
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
......@@ -530,7 +558,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
A tuple of :obj:`str`: The files saved.
"""
if os.path.isdir(save_directory):
files = self._tokenizer.save_model(save_directory)
files = self._tokenizer.model.save(save_directory)
else:
folder, file = os.path.split(os.path.abspath(save_directory))
files = self._tokenizer.save_model(folder, name=file)
......
......@@ -648,6 +648,10 @@ class XLMTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
......
......@@ -20,6 +20,7 @@ from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlnet import SPIECE_UNDERLINE
from .utils import logging
......@@ -307,3 +308,190 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
<https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = XLMRobertaTokenizer
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
**kwargs
):
super().__init__(
vocab_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
self.vocab_file = vocab_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
An XLM-RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
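# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the returned mask is [1, 0, 0, 1, 1, 0, 1] -- 1 marks the added <s>/</s> positions,
# 0 marks the sequence tokens.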
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
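# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the result is [0, 0, 0, 0, 0, 0, 0] -- one zero per position of ``<s> A </s></s> B </s>``.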
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
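# A minimal usage sketch (assumes the public "xlm-roberta-base" checkpoint is reachable):
#
#     from transformers import XLMRobertaTokenizerFast
#     tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
#     ids = tokenizer.encode("Hello world")
#     # ids starts with cls_token_id and ends with sep_token_id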
......@@ -21,6 +21,7 @@ from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
......@@ -344,3 +345,213 @@ class XLNetTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
class XLNetTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on
`SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
slow_tokenizer_class = XLNetTokenizer
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=False,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"],
**kwargs
):
super().__init__(
vocab_file=vocab_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
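# Segment (token type) IDs used by XLNet: 0 for the first sequence, 1 for the second,
# 2 for the <cls> token (see create_token_type_ids_from_sequences below), and 3 for
# padded positions.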
self._pad_token_type_id = 3
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An XLNet sequence has the following format:
- single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
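# Illustrative sketch (toy IDs, assumed for documentation only): if sep_token_id were 4
# and cls_token_id were 3, build_inputs_with_special_tokens([5, 6], [7]) would return
# [5, 6, 4, 7, 4, 3], i.e. ``A <sep> B <sep> <cls>`` -- the special tokens come last.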
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]
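# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the returned mask is [0, 0, 1, 0, 1, 1] -- zeros for sequence tokens, ones for the
# <sep> tokens and the trailing <cls>.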
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2
| first sequence    | second sequence   |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s) followed by
the :obj:`cls_token` segment ID (2).
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0] + cls_segment_id
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
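# Illustrative sketch (assumed inputs): for token_ids_0 = [5, 6] and token_ids_1 = [7],
# the returned type IDs are [0, 0, 0, 1, 1, 2] -- 0 for ``A <sep>``, 1 for ``B <sep>``,
# 2 for the final <cls>.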
def save_vocabulary(self, save_directory):
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
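# A minimal usage sketch (assumes the public "xlnet-base-cased" checkpoint is reachable):
#
#     from transformers import XLNetTokenizerFast
#     tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
#     batch = tokenizer(["short", "a longer sentence"], padding=True)
#     # Because padding_side is "left", pad tokens are prepended to the shorter input.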
......@@ -17,7 +17,7 @@
import os
import unittest
from transformers.tokenization_albert import AlbertTokenizer
from transformers.tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin
......@@ -28,6 +28,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = AlbertTokenizer
rust_tokenizer_class = AlbertTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
......@@ -41,6 +43,28 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
output_text = "this is a test"
return input_text, output_text
def test_rust_and_python_full_tokenizers(self):
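# Check that the slow (Python) tokenizer and the fast (Rust-backed) tokenizer agree
# on tokenization and on encoding, both without and with special tokens.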
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_full_tokenizer(self):
tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
......
......@@ -12,6 +12,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BartTokenizer
rust_tokenizer_class = BartTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
......
......@@ -35,7 +35,9 @@ from .test_tokenization_common import TokenizerTesterMixin
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertTokenizer
rust_tokenizer_class = BertTokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
......@@ -61,9 +63,6 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running"
......
......@@ -15,6 +15,7 @@
import os
import pickle
import unittest
from transformers.testing_utils import custom_tokenizers
......@@ -33,6 +34,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer
space_between_special_tokens = True
def setUp(self):
super().setUp()
......@@ -87,6 +89,26 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_pickle_mecab_tokenizer(self):
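# The MeCab-backed tokenizer must produce the same tokens before and after a pickle
# round trip.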
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
self.assertIsNotNone(tokenizer)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
tokens_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(tokens, tokens_loaded)
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
......