"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "6d92c429c72db077fab93ecf2e2d34f8cbb6a523"
Unverified commit e93763d4 authored by SaulLu, committed by GitHub

fix CLIP fast tokenizer and change some properties of the slow version (#15067)



Major changes to the CLIP fast tokenizer, whose behavior did not match that of the CLIP slow tokenizer.
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 240cc6cb
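In short, the fast tokenizer is rebuilt so that it mirrors the slow one: NFC normalization, whitespace collapsing and lowercasing, CLIP's regex split followed by byte-level BPE with the `</w>` suffix, and `<|startoftext|>`/`<|endoftext|>` wrapping. A minimal sketch (not part of this commit) to check that the two now agree; it assumes transformers with this fix and `ftfy` are installed and that the `openai/clip-vit-base-patch32` checkpoint is reachable.

```
# Compare slow and fast CLIP tokenizers on the same input (mirrors the new tests).
from transformers import CLIPTokenizer, CLIPTokenizerFast

slow = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
fast = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")

text = "A photo of a cat\n  sitting   on THE mat."
assert slow.tokenize(text) == fast.tokenize(text)
assert slow(text)["input_ids"] == fast(text)["input_ids"]
```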
@@ -559,6 +559,10 @@ jobs:
           if [ -f test_list.txt ]; then
             python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
           fi
+      - run: |
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
+          fi
       - store_artifacts:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
...
@@ -105,6 +105,7 @@ _deps = [
     "filelock",
     "flake8>=3.8.3",
     "flax>=0.3.5",
+    "ftfy",
     "fugashi>=1.0",
     "GitPython<3.1.19",
     "huggingface-hub>=0.1.0,<1.0",
@@ -242,6 +243,7 @@ else:
 extras["flax"] = deps_list("jax", "jaxlib", "flax", "optax")
 extras["tokenizers"] = deps_list("tokenizers")
+extras["ftfy"] = deps_list("ftfy")
 extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
 extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"]
 extras["modelcreation"] = deps_list("cookiecutter")
...
@@ -823,6 +823,7 @@ class CLIPConverter(Converter):
     def converted(self) -> Tokenizer:
         vocab = self.original_tokenizer.encoder
         merges = list(self.original_tokenizer.bpe_ranks.keys())
+        unk_token = self.original_tokenizer.unk_token

         tokenizer = Tokenizer(
             BPE(
@@ -832,13 +833,32 @@ class CLIPConverter(Converter):
                 continuing_subword_prefix="",
                 end_of_word_suffix="</w>",
                 fuse_unk=False,
+                unk_token=str(unk_token),
             )
         )

-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space)
+        tokenizer.normalizer = normalizers.Sequence(
+            [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()]
+        )
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(
+                    Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""),
+                    behavior="removed",
+                    invert=True,
+                ),
+                pre_tokenizers.ByteLevel(add_prefix_space=False),
+            ]
+        )
         tokenizer.decoder = decoders.ByteLevel()
-        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+        # Hack to have a ByteLevel and TemplateProcessor
+        tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id),
+            cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id),
+            add_prefix_space=False,
+            trim_offsets=False,
+        )
+
         return tokenizer
...
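The converter now gives the fast tokenizer an explicit normalizer (NFC, whitespace collapsing, lowercasing) and a two-step pre-tokenizer (CLIP's regex split, then byte-level). A standalone sketch, not part of the diff, of what those stages do to raw text using the `tokenizers` library directly; exact offsets may vary with the installed `tokenizers` version.

```
from tokenizers import Regex, normalizers, pre_tokenizers

# Same components the converter wires into the fast tokenizer.
normalizer = normalizers.Sequence(
    [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()]
)
pre_tokenizer = pre_tokenizers.Sequence(
    [
        pre_tokenizers.Split(
            Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""),
            behavior="removed",
            invert=True,
        ),
        pre_tokenizers.ByteLevel(add_prefix_space=False),
    ]
)

text = normalizer.normalize_str("A  Photo\tof a CAT")  # -> "a photo of a cat"
print(pre_tokenizer.pre_tokenize_str(text))            # word pieces with byte-level offsets
```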
@@ -15,6 +15,7 @@ deps = {
     "filelock": "filelock",
     "flake8": "flake8>=3.8.3",
     "flax": "flax>=0.3.5",
+    "ftfy": "ftfy",
     "fugashi": "fugashi>=1.0",
     "GitPython": "GitPython<3.1.19",
     "huggingface-hub": "huggingface-hub>=0.1.0,<1.0",
...
@@ -158,6 +158,13 @@ except importlib_metadata.PackageNotFoundError:
 except importlib_metadata.PackageNotFoundError:
     _faiss_available = False

+_ftfy_available = importlib.util.find_spec("ftfy") is not None
+try:
+    _ftfy_version = importlib_metadata.version("ftfy")
+    logger.debug(f"Successfully imported ftfy version {_ftfy_version}")
+except importlib_metadata.PackageNotFoundError:
+    _ftfy_available = False
+
 coloredlogs = importlib.util.find_spec("coloredlogs") is not None
 try:
@@ -441,6 +448,10 @@ def is_flax_available():
     return _flax_available

+def is_ftfy_available():
+    return _ftfy_available
+
 def is_torch_tpu_available():
     if not _torch_available:
         return False
@@ -516,10 +527,6 @@ def is_spacy_available():
     return importlib.util.find_spec("spacy") is not None

-def is_ftfy_available():
-    return importlib.util.find_spec("ftfy") is not None
-
 def is_in_notebook():
     try:
         # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
@@ -722,6 +729,13 @@ FLAX_IMPORT_ERROR = """
 installation page: https://github.com/google/flax and follow the ones that match your environment.
 """

+# docstyle-ignore
+FTFY_IMPORT_ERROR = """
+{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
+installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
+that match your environment.
+"""
+
 # docstyle-ignore
 SCATTER_IMPORT_ERROR = """
@@ -801,6 +815,7 @@ BACKENDS_MAPPING = OrderedDict(
         ("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)),
         ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
         ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
+        ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
         ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
         ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
         ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
...
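With ftfy availability now cached and exposed through `is_ftfy_available()` (and wired into `BACKENDS_MAPPING` via `FTFY_IMPORT_ERROR`), code that prefers ftfy's text cleanup can guard on it. A hypothetical usage sketch; only `is_ftfy_available` comes from the diff, the rest is illustrative.

```
from transformers.file_utils import is_ftfy_available

def clean_text(text: str) -> str:
    # Use ftfy to fix mojibake when available, otherwise just collapse whitespace.
    if is_ftfy_available():
        import ftfy

        text = ftfy.fix_text(text)
    return " ".join(text.split())
```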
@@ -48,7 +48,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 PRETRAINED_INIT_CONFIGURATION = {
-    "openai/clip-vit-base-patch32": {"do_lower_case": True},
+    "openai/clip-vit-base-patch32": {},
 }
@@ -101,19 +101,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
     """
     Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.

-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    <Tip>
-
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
-
-    </Tip>
-
     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
     this superclass for more information regarding those methods.
@@ -132,9 +119,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
             The beginning of sequence token.
         eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (CLIP tokenizer detect beginning of words by the preceding space).
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -151,8 +135,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
         bos_token="<|startoftext|>",
         eos_token="<|endoftext|>",
         pad_token="<|endoftext|>",  # hack to enable padding
-        add_prefix_space=False,
-        do_lower_case=True,
         **kwargs
     ):
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
@@ -165,8 +147,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
-            add_prefix_space=add_prefix_space,
-            do_lower_case=do_lower_case,
             **kwargs,
         )
@@ -190,21 +170,12 @@ class CLIPTokenizer(PreTrainedTokenizer):
         bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
-        self.add_prefix_space = add_prefix_space

         self.pat = re.compile(
             r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
             re.IGNORECASE,
         )

-    # Very ugly hack to enable padding
-    @property
-    def pad_token_id(self) -> Optional[int]:
-        """
-        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
-        """
-        return 0
-
     @property
     def vocab_size(self):
         return len(self.encoder)
@@ -232,9 +203,12 @@ class CLIPTokenizer(PreTrainedTokenizer):
         Returns:
             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
+        bos_token = [self.bos_token_id]
+        eos_token = [self.eos_token_id]
+
         if token_ids_1 is None:
-            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        return [self.bos_token_id] + token_ids_0 + token_ids_1 + [self.eos_token_id]
+            return bos_token + token_ids_0 + eos_token
+        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

     def get_special_tokens_mask(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
@@ -262,7 +236,30 @@ class CLIPTokenizer(PreTrainedTokenizer):

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
+        zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        bos_token = [self.bos_token_id]
+        eos_token = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(bos_token + token_ids_0 + eos_token) * [0]
+        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

     def bpe(self, token):
         if token in self.cache:
@@ -332,7 +329,8 @@ class CLIPTokenizer(PreTrainedTokenizer):
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors).replace("</w>", " ")
+        byte_array = bytearray([self.byte_decoder[c] for c in text])
+        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
         return text

     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
@@ -363,9 +361,3 @@ class CLIPTokenizer(PreTrainedTokenizer):
             index += 1

         return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_split_into_words or add_prefix_space:
-            text = " " + text
-        return (text, kwargs)
...
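After these changes, `build_inputs_with_special_tokens` wraps a single sequence as `<|startoftext|> X <|endoftext|>` and joins a pair with two consecutive `<|endoftext|>` tokens rather than a dedicated separator, and the hard-coded `pad_token_id = 0` property is gone, so padding uses the id of the configured pad token. Illustrative only; the ids below are symbolic placeholders except for the bos/eos ids, which correspond to the released CLIP vocabulary.

```
ids_a = [320, 1125]       # hypothetical ids for sequence A
ids_b = [539]             # hypothetical ids for sequence B
bos, eos = 49406, 49407   # <|startoftext|>, <|endoftext|> in the released CLIP vocab

single = [bos] + ids_a + [eos]
pair = [bos] + ids_a + [eos] + [eos] + ids_b + [eos]
```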
@@ -15,12 +15,10 @@
 """Tokenization classes for OpenAI GPT."""

-import json
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple

 from tokenizers import pre_tokenizers

-from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
 from .tokenization_clip import CLIPTokenizer
@@ -52,27 +50,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
     Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
     Byte-Pair-Encoding.

-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```
-    >>> from transformers import CLIPTokenizerFast
-    >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
-    >>> tokenizer("Hello world")['input_ids']
-    [15496, 995]
-    >>> tokenizer(" Hello world")['input_ids']
-    [18435, 995]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    <Tip>
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    </Tip>
-
     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods.
@@ -81,9 +58,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
             Path to the vocabulary file.
         merges_file (`str`):
             Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
         unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -91,11 +65,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
             The beginning of sequence token.
         eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (CLIP tokenizer detect beginning of words by the preceding space).
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -113,7 +82,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
         bos_token="<|startoftext|>",
         eos_token="<|endoftext|>",
         pad_token="<|endoftext|>",  # hack to enable padding
-        add_prefix_space=False,
         **kwargs
     ):
         super().__init__(
@@ -124,44 +92,81 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
-            add_prefix_space=add_prefix_space,
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
-    # Very ugly hack to enable padding
-    @property
-    def pad_token_id(self) -> Optional[int]:
-        """
-        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
-        """
-        return 0
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
+        if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence):
+            raise ValueError(
+                "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been "
+                "heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using "
+                "to be compatible with this version. The easiest way to do so is "
+                '`CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo", from_slow=True)`. '
+                "If you want to use your existing tokenizer, you will have to revert to a version prior to "
+                "4.17.0 of transformers."
+            )
+
+        self._wrap_decode_method_backend_tokenizer()
+
+    # Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872
+    def _wrap_decode_method_backend_tokenizer(self):
+        orig_decode_method = self.backend_tokenizer.decode
+
+        def new_decode_method(*args, **kwargs):
+            text = orig_decode_method(*args, **kwargs)
+            text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip()
+            return text
+
+        self.backend_tokenizer.decode = new_decode_method
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A CLIP sequence has the following format:
+
+        - single sequence: `<|startoftext|> X <|endoftext|>`
+
+        Pairs of sequences are not the expected use case, but they will be handled without a separator.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        bos_token = [self.bos_token_id]
+        eos_token = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return bos_token + token_ids_0 + eos_token
+        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
+        zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        bos_token = [self.bos_token_id]
+        eos_token = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(bos_token + token_ids_0 + eos_token) * [0]
+        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
...
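Because the fast tokenizer now requires the new `pre_tokenizers.Sequence` backend, a `tokenizer.json` saved with an earlier release triggers the `ValueError` above. A sketch of the migration path suggested by the error message; the folder path is a placeholder for a Hub repo or local directory saved with transformers < 4.17.0.

```
from transformers import CLIPTokenizerFast

# Rebuild the fast tokenizer from the slow vocab/merges files, then re-save it.
tokenizer = CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo", from_slow=True)
tokenizer.save_pretrained("path_to_local_folder_or_hub_repo")
```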
@@ -20,7 +20,7 @@ import unittest

 from transformers import CLIPTokenizer, CLIPTokenizerFast
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tokenizers
+from transformers.testing_utils import require_ftfy, require_tokenizers

 from .test_tokenization_common import TokenizerTesterMixin
@@ -30,18 +30,20 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = CLIPTokenizer
     rust_tokenizer_class = CLIPTokenizerFast
-    test_rust_tokenizer = False
-    from_pretrained_kwargs = {"add_prefix_space": True}
+    test_rust_tokenizer = True
+    from_pretrained_kwargs = {}
     test_seq2seq = False

     def setUp(self):
         super().setUp()

+        # temporary addition: to test the new slow to fast converter
+        self.tokenizers_list = [(CLIPTokenizerFast, "SaulLu/clip-vit-base-patch32", {})]
+
         # fmt: off
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|endoftext|>"]
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
         # fmt: on
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
+        merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
         self.special_tokens_map = {"unk_token": "<unk>"}

         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
@@ -61,148 +63,126 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
-        output_text = "lower newer "
+        output_text = "lower newer"
         return input_text, output_text

     def test_full_tokenizer(self):
         tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
         text = "lower newer"
         bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"]
-        tokens = tokenizer.tokenize(text, add_prefix_space=True)
+        tokens = tokenizer.tokenize(text)
         self.assertListEqual(tokens, bpe_tokens)

         input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16]
+        input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20]
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-
-        sequence = "lower newer"
-
-        # Testing tokenization
-        tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        # Testing conversion to ids without special tokens
-        ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing conversion to ids with special tokens
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-        ids = tokenizer.encode(sequence, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing the unknown token
-        input_tokens = tokens + [rust_tokenizer.unk_token]
-        input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16]
-        self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def test_pretokenized_inputs(self, *args, **kwargs):
-        # It's very difficult to mix/test pretokenization with byte-level
-        # And get both CLIP and Roberta to work at the same time (mostly an issue of adding a space before the string)
-        pass
-
-    def test_padding(self, max_length=15):
+    @require_ftfy
+    def test_check_encoding_slow_fast(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-                # Simple input
-                s = "This is a simple input"
-                s2 = ["This is a simple input 1", "This is a simple input 2"]
-                p = ("This is a simple input", "This is a pair")
-                p2 = [
-                    ("This is a simple input 1", "This is a simple input 2"),
-                    ("This is a simple pair 1", "This is a simple pair 2"),
-                ]
-
-                # Simple input tests
-                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    s2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    p2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-    def test_add_tokens_tokenizer(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                vocab_size = tokenizer.vocab_size
-                all_size = len(tokenizer)
-
-                self.assertNotEqual(vocab_size, 0)
-
-                # We usually have added tokens from the start in tests because our vocab fixtures are
-                # smaller than the original vocabs - let's not assert this
-                # self.assertEqual(vocab_size, all_size)
-
-                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
-                added_toks = tokenizer.add_tokens(new_toks)
-                vocab_size_2 = tokenizer.vocab_size
-                all_size_2 = len(tokenizer)
-
-                self.assertNotEqual(vocab_size_2, 0)
-                self.assertEqual(vocab_size, vocab_size_2)
-                self.assertEqual(added_toks, len(new_toks))
-                self.assertEqual(all_size_2, all_size + len(new_toks))
-
-                tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
-
-                self.assertGreaterEqual(len(tokens), 4)
-                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-
-                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
-                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
-                vocab_size_3 = tokenizer.vocab_size
-                all_size_3 = len(tokenizer)
-
-                self.assertNotEqual(vocab_size_3, 0)
-                self.assertEqual(vocab_size, vocab_size_3)
-                self.assertEqual(added_toks_2, len(new_toks_2))
-                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
-
-                tokens = tokenizer.encode(
-                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
-                )
-
-                self.assertGreaterEqual(len(tokens), 6)
-                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-                self.assertGreater(tokens[0], tokens[1])
-                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-                self.assertGreater(tokens[-2], tokens[-3])
-                self.assertEqual(tokens[0], tokenizer.eos_token_id)
-                # padding is very hacky in CLIPTokenizer, pad_token_id is always 0
-                # so skip this check
-                # self.assertEqual(tokens[-2], tokenizer.pad_token_id)
+                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
+                text_tokenized_s = tokenizer_s.tokenize(text)
+                text_tokenized_r = tokenizer_r.tokenize(text)
+
+                self.assertListEqual(text_tokenized_s, text_tokenized_r)
+
+                # Test that the tokenization is identical on an example containing a character (Latin Small Letter A
+                # with Tilde) encoded in 2 different ways
+                text = "xa\u0303y" + " " + "x\xe3y"
+                text_tokenized_s = tokenizer_s.tokenize(text)
+                text_tokenized_r = tokenizer_r.tokenize(text)
+
+                self.assertListEqual(text_tokenized_s, text_tokenized_r)
+
+                # Test that the tokenization is identical on unicode of space type
+                spaces_unicodes = [
+                    "\u0009",  # (horizontal tab, '\t')
+                    "\u000B",  # (vertical tab)
+                    "\u000C",  # (form feed)
+                    "\u0020",  # (space, ' ')
+                    "\u200E",  # (left-to-right mark)
+                    "\u200F",  # (right-to-left mark)
+                ]
+
+                for unicode_seq in spaces_unicodes:
+                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
+                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
+
+                    self.assertListEqual(text_tokenized_s, text_tokenized_r)
+
+                # Test that the tokenization is identical on unicode of line break type
+                line_break_unicodes = [
+                    "\u000A",  # (line feed, '\n')
+                    "\r\n",  # (carriage return and line feed, '\r\n')
+                    "\u000D",  # (carriage return, '\r')
+                    "\r",  # (carriage return, '\r')
+                    "\u000D",  # (carriage return, '\r')
+                    "\u2028",  # (line separator)
+                    "\u2029",  # (paragraph separator)
+                    # "\u0085",  # (next line)
+                ]
+
+                # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
+                # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
+                # space (and thus into an empty list).
+
+                for unicode_seq in line_break_unicodes:
+                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
+                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
+
+                    self.assertListEqual(text_tokenized_s, text_tokenized_r)
+
+    def test_offsets_mapping_with_different_add_prefix_space_argument(self):
+        # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space`
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
+
+                text = f"{text_of_1_token} {text_of_1_token}"
+
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                    pretrained_name,
+                    use_fast=True,
+                )
+                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
+                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
+                self.assertEqual(
+                    encoding.offset_mapping[1],
+                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
+                )
+
+                text = f" {text}"
+
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                    pretrained_name,
+                    use_fast=True,
+                )
+                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
+                self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
+                self.assertEqual(
+                    encoding.offset_mapping[1],
+                    (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
+                )
+
+    def test_log_warning(self):
+        # Test related to the breaking change introduced in transformers v4.17.0
+        # We need to check that an error is raised when the user tries to load a previous version of the tokenizer.
+        with self.assertRaises(ValueError) as context:
+            self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")
+
+        self.assertTrue(
+            context.exception.args[0].startswith(
+                "The `backend_tokenizer` provided does not match the expected format."
+            )
+        )
+
+    @require_ftfy
+    def test_tokenization_python_rust_equals(self):
+        super().test_tokenization_python_rust_equals()
+
+    # overwrite common test
+    def test_added_tokens_do_lower_case(self):
+        # CLIP always lower cases letters
+        pass