"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b844f8a9ab5bbbbf8001e2d83c75e85eaf40549c"
Unverified commit 39db2f3c, authored by Bram Vanroy and committed by GitHub
Browse files

Allow local_files_only for fast pretrained tokenizers (#13225)

* allow local_files_only for fast pretrained tokenizers

* make style
parent 2772d3e7
...@@ -1654,6 +1654,7 @@ def get_list_of_files( ...@@ -1654,6 +1654,7 @@ def get_list_of_files(
path_or_repo: Union[str, os.PathLike], path_or_repo: Union[str, os.PathLike],
revision: Optional[str] = None, revision: Optional[str] = None,
use_auth_token: Optional[Union[bool, str]] = None, use_auth_token: Optional[Union[bool, str]] = None,
local_files_only: bool = False,
) -> List[str]: ) -> List[str]:
""" """
Gets the list of files inside :obj:`path_or_repo`. Gets the list of files inside :obj:`path_or_repo`.
...@@ -1668,6 +1669,8 @@ def get_list_of_files( ...@@ -1668,6 +1669,8 @@ def get_list_of_files(
use_auth_token (:obj:`str` or `bool`, `optional`): use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns: Returns:
:obj:`List[str]`: The list of files available in :obj:`path_or_repo`. :obj:`List[str]`: The list of files available in :obj:`path_or_repo`.
...@@ -1681,7 +1684,7 @@ def get_list_of_files( ...@@ -1681,7 +1684,7 @@ def get_list_of_files(
return list_of_files return list_of_files
# Can't grab the files if we are on offline mode. # Can't grab the files if we are on offline mode.
if is_offline_mode(): if is_offline_mode() or local_files_only:
return [] return []
# Otherwise we grab the token and use the model_info method. # Otherwise we grab the token and use the model_info method.
......
...@@ -1566,6 +1566,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): ...@@ -1566,6 +1566,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
use_auth_token (:obj:`str` or `bool`, `optional`): use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
...@@ -1645,7 +1647,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): ...@@ -1645,7 +1647,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
else: else:
# At this point pretrained_model_name_or_path is either a directory or a model identifier name # At this point pretrained_model_name_or_path is either a directory or a model identifier name
fast_tokenizer_file = get_fast_tokenizer_file( fast_tokenizer_file = get_fast_tokenizer_file(
pretrained_model_name_or_path, revision=revision, use_auth_token=use_auth_token pretrained_model_name_or_path,
revision=revision,
use_auth_token=use_auth_token,
local_files_only=local_files_only,
) )
additional_files_names = { additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE, "added_tokens_file": ADDED_TOKENS_FILE,
...@@ -3389,6 +3394,7 @@ def get_fast_tokenizer_file( ...@@ -3389,6 +3394,7 @@ def get_fast_tokenizer_file(
path_or_repo: Union[str, os.PathLike], path_or_repo: Union[str, os.PathLike],
revision: Optional[str] = None, revision: Optional[str] = None,
use_auth_token: Optional[Union[bool, str]] = None, use_auth_token: Optional[Union[bool, str]] = None,
local_files_only: bool = False,
) -> str: ) -> str:
""" """
Get the tokenizer file to use for this version of transformers. Get the tokenizer file to use for this version of transformers.
...@@ -3403,12 +3409,16 @@ def get_fast_tokenizer_file( ...@@ -3403,12 +3409,16 @@ def get_fast_tokenizer_file(
use_auth_token (:obj:`str` or `bool`, `optional`): use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns: Returns:
:obj:`str`: The tokenizer file to use. :obj:`str`: The tokenizer file to use.
""" """
# Inspect all files from the repo/folder. # Inspect all files from the repo/folder.
all_files = get_list_of_files(path_or_repo, revision=revision, use_auth_token=use_auth_token) all_files = get_list_of_files(
path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only
)
tokenizer_files_map = {} tokenizer_files_map = {}
for file_name in all_files: for file_name in all_files:
search = _re_tokenizer_file.search(file_name) search = _re_tokenizer_file.search(file_name)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment