Unverified Commit 39db2f3c authored by Bram Vanroy's avatar Bram Vanroy Committed by GitHub
Browse files

Allow local_files_only for fast pretrained tokenizers (#13225)

* allow local_files_only for fast pretrained tokenizers

* make style
parent 2772d3e7
......@@ -1654,6 +1654,7 @@ def get_list_of_files(
path_or_repo: Union[str, os.PathLike],
revision: Optional[str] = None,
use_auth_token: Optional[Union[bool, str]] = None,
local_files_only: bool = False,
) -> List[str]:
"""
Gets the list of files inside :obj:`path_or_repo`.
......@@ -1668,6 +1669,8 @@ def get_list_of_files(
use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns:
:obj:`List[str]`: The list of files available in :obj:`path_or_repo`.
......@@ -1681,7 +1684,7 @@ def get_list_of_files(
return list_of_files
# Can't grab the files if we are on offline mode.
if is_offline_mode():
if is_offline_mode() or local_files_only:
return []
# Otherwise we grab the token and use the model_info method.
......
......@@ -1566,6 +1566,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
......@@ -1645,7 +1647,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
else:
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
fast_tokenizer_file = get_fast_tokenizer_file(
pretrained_model_name_or_path, revision=revision, use_auth_token=use_auth_token
pretrained_model_name_or_path,
revision=revision,
use_auth_token=use_auth_token,
local_files_only=local_files_only,
)
additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE,
......@@ -3389,6 +3394,7 @@ def get_fast_tokenizer_file(
path_or_repo: Union[str, os.PathLike],
revision: Optional[str] = None,
use_auth_token: Optional[Union[bool, str]] = None,
local_files_only: bool = False,
) -> str:
"""
Get the tokenizer file to use for this version of transformers.
......@@ -3403,12 +3409,16 @@ def get_fast_tokenizer_file(
use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns:
:obj:`str`: The tokenizer file to use.
"""
# Inspect all files from the repo/folder.
all_files = get_list_of_files(path_or_repo, revision=revision, use_auth_token=use_auth_token)
all_files = get_list_of_files(
path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only
)
tokenizer_files_map = {}
for file_name in all_files:
search = _re_tokenizer_file.search(file_name)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment