Unverified commit 786ced36, authored by Sylvain Gugger and committed by GitHub

Add versioning system to fast tokenizer files (#12713)



* Add versioning system to fast tokenizer files

* Deal with offline mode

* Use staging env in tests

* Style

* Apply suggestions from code review
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Style
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent 037bdf82
@@ -1642,6 +1642,53 @@ def get_from_cache(
     return cache_path
 
 
+def get_list_of_files(
+    path_or_repo: Union[str, os.PathLike],
+    revision: Optional[str] = None,
+    use_auth_token: Optional[Union[bool, str]] = None,
+) -> List[str]:
+    """
+    Gets the list of files inside :obj:`path_or_repo`.
+
+    Args:
+        path_or_repo (:obj:`str` or :obj:`os.PathLike`):
+            Can be either the id of a repo on huggingface.co or a path to a `directory`.
+        revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+            identifier allowed by git.
+        use_auth_token (:obj:`str` or `bool`, `optional`):
+            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
+            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
+
+    Returns:
+        :obj:`List[str]`: The list of files available in :obj:`path_or_repo`.
+    """
+    path_or_repo = str(path_or_repo)
+    # If path_or_repo is a folder, we just return what is inside (subdirectories included).
+    if os.path.isdir(path_or_repo):
+        list_of_files = []
+        for path, dir_names, file_names in os.walk(path_or_repo):
+            list_of_files.extend([os.path.join(path, f) for f in file_names])
+        return list_of_files
+    # Can't grab the files if we are in offline mode.
+    if is_offline_mode():
+        return []
+    # Otherwise we grab the token and use the model_info method.
+    if isinstance(use_auth_token, str):
+        token = use_auth_token
+    elif use_auth_token is True:
+        token = HfFolder.get_token()
+    else:
+        token = None
+    model_info = HfApi(endpoint=HUGGINGFACE_CO_RESOLVE_ENDPOINT).model_info(
+        path_or_repo, revision=revision, token=token
+    )
+    return [f.rfilename for f in model_info.siblings]
+
+
 class cached_property(property):
     """
     Descriptor that mimics @property but caches output in member variable.
......
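For context, a minimal sketch of how the new helper can be exercised, assuming this branch of transformers is installed; the local directory path is made up, and `bert-base-cased` is only an example public repo id:

```python
from transformers.file_utils import get_list_of_files

# Local directory: everything found by os.walk, subdirectories included.
local_files = get_list_of_files("./my_saved_tokenizer")  # hypothetical path

# Hub repo id: file names returned by HfApi.model_info (an empty list in offline mode).
remote_files = get_list_of_files("bert-base-cased")
print(remote_files)  # e.g. ['config.json', 'tokenizer.json', 'vocab.txt', ...]
```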
@@ -21,6 +21,7 @@ of output with special method for the Fast tokenizers)
 import copy
 import json
 import os
+import re
 import warnings
 from collections import OrderedDict, UserDict
 from contextlib import contextmanager
@@ -28,9 +29,11 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
 
 import numpy as np
+from packaging import version
 
 import requests
 
+from . import __version__
 from .file_utils import (
     ExplicitEnum,
     PaddingStrategy,
@@ -44,6 +47,7 @@ from .file_utils import (
     add_end_docstrings,
     cached_path,
     copy_func,
+    get_list_of_files,
     hf_bucket_url,
     is_flax_available,
     is_offline_mode,
@@ -115,6 +119,7 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
 
 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 FULL_TOKENIZER_FILE = "tokenizer.json"
+_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")
 
 
 class TruncationStrategy(ExplicitEnum):
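As a quick illustration of what the new pattern matches (`tokenizer.4.0.0.json` mirrors the test files below; the other versioned name is hypothetical):

```python
import re

_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")

print(_re_tokenizer_file.search("tokenizer.json"))                  # None: the unversioned file is not matched
print(_re_tokenizer_file.search("tokenizer.4.0.0.json").group(1))   # 4.0.0
print(_re_tokenizer_file.search("tokenizer.4.12.0.json").group(1))  # 4.12.0
```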
@@ -1639,11 +1644,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             vocab_files[file_id] = pretrained_model_name_or_path
         else:
             # At this point pretrained_model_name_or_path is either a directory or a model identifier name
+            fast_tokenizer_file = get_fast_tokenizer_file(
+                pretrained_model_name_or_path, revision=revision, use_auth_token=use_auth_token
+            )
             additional_files_names = {
                 "added_tokens_file": ADDED_TOKENS_FILE,
                 "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                 "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
-                "tokenizer_file": FULL_TOKENIZER_FILE,
+                "tokenizer_file": fast_tokenizer_file,
             }
             # Look for the tokenizer files
             for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
@@ -3374,6 +3382,51 @@ For a more complete example, see the implementation of `prepare_seq2seq_batch`.
         return model_inputs
 
 
+def get_fast_tokenizer_file(
+    path_or_repo: Union[str, os.PathLike],
+    revision: Optional[str] = None,
+    use_auth_token: Optional[Union[bool, str]] = None,
+) -> str:
+    """
+    Get the tokenizer file to use for this version of transformers.
+
+    Args:
+        path_or_repo (:obj:`str` or :obj:`os.PathLike`):
+            Can be either the id of a repo on huggingface.co or a path to a `directory`.
+        revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+            identifier allowed by git.
+        use_auth_token (:obj:`str` or `bool`, `optional`):
+            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
+            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
+
+    Returns:
+        :obj:`str`: The tokenizer file to use.
+    """
+    # Inspect all files from the repo/folder.
+    all_files = get_list_of_files(path_or_repo, revision=revision, use_auth_token=use_auth_token)
+    tokenizer_files_map = {}
+    for file_name in all_files:
+        search = _re_tokenizer_file.search(file_name)
+        if search is not None:
+            v = search.groups()[0]
+            tokenizer_files_map[v] = file_name
+    # Sort by actual version rather than lexicographically, so e.g. "4.10.0" sorts after "4.2.0".
+    available_versions = sorted(tokenizer_files_map.keys(), key=version.parse)
+
+    # Defaults to FULL_TOKENIZER_FILE and then tries to look at some newer versions.
+    tokenizer_file = FULL_TOKENIZER_FILE
+    transformers_version = version.parse(__version__)
+    for v in available_versions:
+        if version.parse(v) <= transformers_version:
+            tokenizer_file = tokenizer_files_map[v]
+        else:
+            # No point going further since the versions are sorted.
+            break
+
+    return tokenizer_file
+
+
 # To update the docstring, we need to copy the method, otherwise we change the original docstring.
 PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)
 PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
......
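A small standalone sketch of the selection rule implemented by `get_fast_tokenizer_file`, using a made-up set of versioned files and a pretend running version:

```python
from packaging import version

# Hypothetical versioned tokenizer files found in a repo.
tokenizer_files_map = {"4.0.0": "tokenizer.4.0.0.json", "4.22.0": "tokenizer.4.22.0.json"}
transformers_version = version.parse("4.9.0")  # pretend this is the installed version

tokenizer_file = "tokenizer.json"  # FULL_TOKENIZER_FILE is the default
for v in sorted(tokenizer_files_map, key=version.parse):
    if version.parse(v) <= transformers_version:
        tokenizer_file = tokenizer_files_map[v]
    else:
        # Versions are sorted, so nothing later can apply either.
        break

print(tokenizer_file)  # tokenizer.4.0.0.json: the newest versioned file not above 4.9.0
```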
@@ -1071,9 +1071,7 @@ class TrainingArguments:
         Get number of steps used for a linear warmup.
         """
         warmup_steps = (
-            self.warmup_steps
-            if self.warmup_steps > 0
-            else math.ceil(num_training_steps * self.warmup_ratio)
+            self.warmup_steps if self.warmup_steps > 0 else math.ceil(num_training_steps * self.warmup_ratio)
         )
         return warmup_steps
......
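For reference, a quick worked example of the (unchanged) warmup computation, with made-up numbers:

```python
import math

warmup_steps, warmup_ratio, num_training_steps = 0, 0.1, 1001  # made-up values
# A warmup_steps of 0 defers to warmup_ratio.
print(warmup_steps if warmup_steps > 0 else math.ceil(num_training_steps * warmup_ratio))  # 101
```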
@@ -14,11 +14,13 @@
 # limitations under the License.
 
 import concurrent.futures
+import json
+import os
 import shutil
 import tempfile
 import unittest
 
-from transformers import PreTrainedTokenizerFast
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
 from transformers.testing_utils import require_tokenizers
 
 from .test_tokenization_common import TokenizerTesterMixin
@@ -98,6 +100,51 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
         self.tmpdirname = tmpdirname_orig
 
 
+@require_tokenizers
+class TokenizerVersioningTest(unittest.TestCase):
+    def test_local_versioning(self):
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+        json_tokenizer = json.loads(tokenizer._tokenizer.to_str())
+        json_tokenizer["model"]["vocab"]["huggingface"] = len(tokenizer)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tokenizer.save_pretrained(tmp_dir)
+            json.dump(json_tokenizer, open(os.path.join(tmp_dir, "tokenizer.4.0.0.json"), "w"))
+
+            # This should pick the new tokenizer file as the version of Transformers is > 4.0.0
+            new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
+            self.assertEqual(len(new_tokenizer), len(tokenizer) + 1)
+            json_tokenizer = json.loads(new_tokenizer._tokenizer.to_str())
+            self.assertIn("huggingface", json_tokenizer["model"]["vocab"])
+
+            # Will need to be adjusted if we reach v42 and this test is still here.
+            # Should pick the old tokenizer file as the version of Transformers is < 42.0.0
+            shutil.move(os.path.join(tmp_dir, "tokenizer.4.0.0.json"), os.path.join(tmp_dir, "tokenizer.42.0.0.json"))
+            new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
+            self.assertEqual(len(new_tokenizer), len(tokenizer))
+            json_tokenizer = json.loads(new_tokenizer._tokenizer.to_str())
+            self.assertNotIn("huggingface", json_tokenizer["model"]["vocab"])
+
+    def test_repo_versioning(self):
+        # This repo has two tokenizer files, one for v4.0.0 and above with an added token, one for versions lower.
+        repo = "sgugger/finetuned-bert-mrpc"
+
+        # This should pick the new tokenizer file as the version of Transformers is > 4.0.0
+        tokenizer = AutoTokenizer.from_pretrained(repo)
+        self.assertEqual(len(tokenizer), 28997)
+        json_tokenizer = json.loads(tokenizer._tokenizer.to_str())
+        self.assertIn("huggingface", json_tokenizer["model"]["vocab"])
+
+        # Testing an older version by monkey-patching the version in the module where it is used.
+        import transformers as old_transformers
+
+        old_transformers.tokenization_utils_base.__version__ = "3.0.0"
+        old_tokenizer = old_transformers.models.auto.AutoTokenizer.from_pretrained(repo)
+        self.assertEqual(len(old_tokenizer), 28996)
+        json_tokenizer = json.loads(old_tokenizer._tokenizer.to_str())
+        self.assertNotIn("huggingface", json_tokenizer["model"]["vocab"])
+
+
 @require_tokenizers
 class ReduceMutableBorrowTests(unittest.TestCase):
     def test_async_share_tokenizer(self):
......
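For model authors, a hedged sketch of how a versioned tokenizer file like the one used in these tests could be produced locally (the vocab modification and the `my-model` directory are hypothetical; uploading to the Hub is left out):

```python
import json
import os

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer.save_pretrained("my-model")  # writes the default tokenizer.json and friends

# Example modification that only Transformers >= 4.0.0 will see: add one token to the vocab.
tok_json = json.loads(tokenizer._tokenizer.to_str())
tok_json["model"]["vocab"]["huggingface"] = len(tokenizer)
with open(os.path.join("my-model", "tokenizer.4.0.0.json"), "w") as f:
    json.dump(tok_json, f)
```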