Unverified Commit 89693e17 authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Remove special treatment for custom vocab files (#10637)



* Remove special path for custom vocab files

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Expand error message
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent 6d9e11a1
......@@ -1601,38 +1601,20 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
s3_models = list(cls.max_model_input_sizes.keys())
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
vocab_files = {}
init_configuration = {}
if pretrained_model_name_or_path in s3_models:
# Get the vocabulary from AWS S3 bucket
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
if (
cls.pretrained_init_configuration
and pretrained_model_name_or_path in cls.pretrained_init_configuration
):
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
else:
# Get the vocabulary from local files
logger.info(
"Model name '{}' not found in model shortcut name list ({}). "
"Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
)
)
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
if len(cls.vocab_files_names) > 1:
raise ValueError(
"Calling {}.from_pretrained() with the path to a single file or url is not supported."
"Use a model identifier or the path to a directory instead.".format(cls.__name__)
)
logger.warning(
"Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
cls.__name__
f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
"supported for this tokenizer. Use a model identifier or the path to a directory instead."
)
warnings.warn(
f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
"won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
FutureWarning,
)
file_id = list(cls.vocab_files_names.keys())[0]
vocab_files[file_id] = pretrained_model_name_or_path
......@@ -1652,7 +1634,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
else:
full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
logger.info(f"Didn't find file {full_file_name}. We won't load it.")
full_file_name = None
else:
full_file_name = hf_bucket_url(
......@@ -1672,7 +1654,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if file_path is None:
resolved_vocab_files[file_id] = None
else:
try:
try:
resolved_vocab_files[file_id] = cached_path(
file_path,
......@@ -1683,6 +1664,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
local_files_only=local_files_only,
use_auth_token=use_auth_token,
)
except FileNotFoundError as error:
if local_files_only:
unresolved_files.append(file_id)
......@@ -1715,9 +1697,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
continue
if file_path == resolved_vocab_files[file_id]:
logger.info("loading file {}".format(file_path))
logger.info(f"loading file {file_path}")
else:
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
return cls._from_pretrained(
resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment