Unverified Commit 9017ba4c authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Fix tokenizer load from one file (#19073)



* Fix tokenizer load from one file

* Add a test

* Style
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
parent 773314ab
......@@ -1726,6 +1726,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
for file_id, file_path in vocab_files.items():
if file_path is None:
resolved_vocab_files[file_id] = None
elif os.path.isfile(file_path):
resolved_vocab_files[file_id] = file_path
elif is_remote_url(file_path):
resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
else:
......
......@@ -31,6 +31,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
from huggingface_hub import HfFolder, delete_repo, set_access_token
from huggingface_hub.file_download import http_get
from parameterized import parameterized
from requests.exceptions import HTTPError
from transformers import (
......@@ -3889,6 +3890,16 @@ class TokenizerUtilTester(unittest.TestCase):
# This checks that we did call the fake head request
mock_head.assert_called()
def test_legacy_load_from_one_file(self):
    """Check that a tokenizer can be loaded from the path of a single vocab file.

    Downloads the ALBERT SentencePiece model to a local file and passes that
    file path directly to ``AlbertTokenizer.from_pretrained``. The test passes
    if loading does not raise.
    """
    # A TemporaryDirectory replaces the deprecated, race-prone
    # ``tempfile.mktemp()``. Cleanup is automatic and cannot raise a secondary
    # error (NameError / FileNotFoundError) that would mask the real failure
    # when the download or the file creation goes wrong.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "spiece.model")
        with open(tmp_file, "wb") as f:
            http_get("https://huggingface.co/albert-base-v1/resolve/main/spiece.model", f)

        # Load only after the file handle is closed so that all downloaded
        # bytes are flushed to disk.
        AlbertTokenizer.from_pretrained(tmp_file)
@is_staging_test
class TokenizerPushToHubTester(unittest.TestCase):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment