Tokenizer from_pretrained should not use local files named like tokenizer files (#19626)

3e490020 · Sylvain Gugger · GitHub · 8fcf5626 · 3e490020 · 3e490020
Unverified Commit 3e490020 authored Oct 14, 2022 by Sylvain Gugger Committed by GitHub Oct 14, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 4 deletions

src/transformers/tokenization_utils_base.py src/transformers/tokenization_utils_base.py +7 -4

tests/test_tokenization_common.py tests/test_tokenization_common.py +16 -0

No files found.
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1670,6 +1670,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        init_configuration = {}
        is_local = os.path.isdir(pretrained_model_name_or_path)
+        single_file_id = None
        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
@@ -1684,6 +1685,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
+            single_file_id = file_id
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
@@ -1726,10 +1728,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
-            elif os.path.isfile(file_path):
+            elif single_file_id == file_id:
-                resolved_vocab_files[file_id] = file_path
+                if os.path.isfile(file_path):
-            elif is_remote_url(file_path):
+                    resolved_vocab_files[file_id] = file_path
-                resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
+                elif is_remote_url(file_path):
+                    resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
            else:
                resolved_vocab_files[file_id] = cached_file(
                    pretrained_model_name_or_path,

--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3920,6 +3920,22 @@ class TokenizerUtilTester(unittest.TestCase):
        finally:
            os.remove(tmp_file)
+        # Supporting this legacy load introduced a weird bug where the tokenizer would load local files if they are in
+        # the current folder and have the right name.
+        if os.path.isfile("tokenizer.json"):
+            # We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it.
+            return
+        try:
+            with open("tokenizer.json", "wb") as f:
+                http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f)
+            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+            # The tiny random BERT has a vocab size of 1024, tiny gpt2 as a vocab size of 1000
+            self.assertEqual(tokenizer.vocab_size, 1000)
+            # Tokenizer should depend on the remote checkpoint, not the local tokenizer.json file.
+        finally:
+            os.remove("tokenizer.json")
    def test_legacy_load_from_url(self):
        # This test is for deprecated behavior and can be removed in v5
        _ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert-base-v1/resolve/main/spiece.model")