Fix tokenizer load from one file (#19073)

* Fix tokenizer load from one file
* Add a test
* Style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>

Fix tokenizer load from one file (#19073)
* Fix tokenizer load from one file
* Add a test
* Style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
9017ba4c · Sylvain Gugger · GitHub · 773314ab · 9017ba4c · 9017ba4c
Unverified commit 9017ba4c, authored Sep 16, 2022 by Sylvain Gugger; committed by GitHub on Sep 16, 2022.
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 0 deletions

src/transformers/tokenization_utils_base.py src/transformers/tokenization_utils_base.py +2 -0

tests/test_tokenization_common.py tests/test_tokenization_common.py +11 -0

No files found.
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1726,6 +1726,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
+            elif os.path.isfile(file_path):
+                resolved_vocab_files[file_id] = file_path
            elif is_remote_url(file_path):
                resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
            else:

--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -31,6 +31,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

 from huggingface_hub import HfFolder, delete_repo, set_access_token
+from huggingface_hub.file_download import http_get
 from parameterized import parameterized
 from requests.exceptions import HTTPError
 from transformers import (
@@ -3889,6 +3890,16 @@ class TokenizerUtilTester(unittest.TestCase):
            # This check we did call the fake head request
            mock_head.assert_called()

+    def test_legacy_load_from_one_file(self):
+        try:
+            tmp_file = tempfile.mktemp()
+            with open(tmp_file, "wb") as f:
+                http_get("https://huggingface.co/albert-base-v1/resolve/main/spiece.model", f)
+
+            AlbertTokenizer.from_pretrained(tmp_file)
+        finally:
+            os.remove(tmp_file)
+

 @is_staging_test
 class TokenizerPushToHubTester(unittest.TestCase):