Unverified Commit 3005f965 authored by Yih-Dar's avatar Yih-Dar Committed by GitHub
Browse files

Save `Processor` (#27761)



* save processor

* Update tests/models/auto/test_processor_auto.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Update tests/test_processing_common.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* fix

---------
Co-authored-by: default avatarydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
parent 98dda8ed
...@@ -25,8 +25,9 @@ from ...configuration_utils import PretrainedConfig ...@@ -25,8 +25,9 @@ from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...feature_extraction_utils import FeatureExtractionMixin from ...feature_extraction_utils import FeatureExtractionMixin
from ...image_processing_utils import ImageProcessingMixin from ...image_processing_utils import ImageProcessingMixin
from ...processing_utils import ProcessorMixin
from ...tokenization_utils import TOKENIZER_CONFIG_FILE from ...tokenization_utils import TOKENIZER_CONFIG_FILE
from ...utils import FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, get_file_from_repo, logging
from .auto_factory import _LazyAutoMapping from .auto_factory import _LazyAutoMapping
from .configuration_auto import ( from .configuration_auto import (
CONFIG_MAPPING_NAMES, CONFIG_MAPPING_NAMES,
...@@ -227,27 +228,41 @@ class AutoProcessor: ...@@ -227,27 +228,41 @@ class AutoProcessor:
processor_class = None processor_class = None
processor_auto_map = None processor_auto_map = None
# First, let's see if we have a preprocessor config. # First, let's see if we have a processor or preprocessor config.
# Filter the kwargs for `get_file_from_repo`. # Filter the kwargs for `get_file_from_repo`.
get_file_from_repo_kwargs = { get_file_from_repo_kwargs = {
key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs
} }
# Let's start by checking whether the processor class is saved in an image processor
preprocessor_config_file = get_file_from_repo( # Let's start by checking whether the processor class is saved in a processor config
pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs processor_config_file = get_file_from_repo(
pretrained_model_name_or_path, PROCESSOR_NAME, **get_file_from_repo_kwargs
) )
if preprocessor_config_file is not None: if processor_config_file is not None:
config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
processor_class = config_dict.get("processor_class", None) processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}): if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"] processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
# If not found, let's check whether the processor class is saved in a feature extractor config if processor_class is None:
if preprocessor_config_file is not None and processor_class is None: # If not found, let's check whether the processor class is saved in an image processor config
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) preprocessor_config_file = get_file_from_repo(
processor_class = config_dict.get("processor_class", None) pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs
if "AutoProcessor" in config_dict.get("auto_map", {}): )
processor_auto_map = config_dict["auto_map"]["AutoProcessor"] if preprocessor_config_file is not None:
config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
# If not found, let's check whether the processor class is saved in a feature extractor config
if preprocessor_config_file is not None and processor_class is None:
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
pretrained_model_name_or_path, **kwargs
)
processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
if processor_class is None: if processor_class is None:
# Next, let's check whether the processor class is saved in a tokenizer # Next, let's check whether the processor class is saved in a tokenizer
......
...@@ -16,14 +16,28 @@ ...@@ -16,14 +16,28 @@
Processing saving/loading class for common processors. Processing saving/loading class for common processors.
""" """
import copy
import inspect
import json
import os import os
import warnings import warnings
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Any, Dict, Optional, Tuple, Union
from .dynamic_module_utils import custom_object_save from .dynamic_module_utils import custom_object_save
from .tokenization_utils_base import PreTrainedTokenizerBase from .tokenization_utils_base import PreTrainedTokenizerBase
from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging from .utils import (
PROCESSOR_NAME,
PushToHubMixin,
add_model_info_to_auto_map,
cached_file,
copy_func,
direct_transformers_import,
download_url,
is_offline_mode,
is_remote_url,
logging,
)
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
...@@ -85,10 +99,70 @@ class ProcessorMixin(PushToHubMixin): ...@@ -85,10 +99,70 @@ class ProcessorMixin(PushToHubMixin):
setattr(self, attribute_name, arg) setattr(self, attribute_name, arg)
def to_dict(self) -> Dict[str, Any]:
"""
Serializes this instance to a Python dictionary.
Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
"""
output = copy.deepcopy(self.__dict__)
# Get the kwargs in `__init__`.
sig = inspect.signature(self.__init__)
# Only save the attributes that are presented in the kwargs of `__init__`.
attrs_to_save = sig.parameters
# Don't save attributes like `tokenizer`, `image processor` etc.
attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes]
# extra attributes to be kept
attrs_to_save += ["auto_map"]
output = {k: v for k, v in output.items() if k in attrs_to_save}
output["processor_class"] = self.__class__.__name__
if "tokenizer" in output:
del output["tokenizer"]
if "image_processor" in output:
del output["image_processor"]
if "feature_extractor" in output:
del output["feature_extractor"]
# Some attributes have different names but containing objects that are not simple strings
output = {
k: v
for k, v in output.items()
if not (isinstance(v, PushToHubMixin) or v.__class__.__name__ == "BeamSearchDecoderCTC")
}
return output
def to_json_string(self) -> str:
"""
Serializes this instance to a JSON string.
Returns:
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
"""
dictionary = self.to_dict()
return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
"""
Save this instance to a JSON file.
Args:
json_file_path (`str` or `os.PathLike`):
Path to the JSON file in which this processor instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string())
def __repr__(self): def __repr__(self):
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
attributes_repr = "\n".join(attributes_repr) attributes_repr = "\n".join(attributes_repr)
return f"{self.__class__.__name__}:\n{attributes_repr}" return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"
def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
""" """
...@@ -139,6 +213,7 @@ class ProcessorMixin(PushToHubMixin): ...@@ -139,6 +213,7 @@ class ProcessorMixin(PushToHubMixin):
if self._auto_class is not None: if self._auto_class is not None:
attrs = [getattr(self, attribute_name) for attribute_name in self.attributes] attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs] configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
configs.append(self)
custom_object_save(self, save_directory, config=configs) custom_object_save(self, save_directory, config=configs)
for attribute_name in self.attributes: for attribute_name in self.attributes:
...@@ -156,6 +231,12 @@ class ProcessorMixin(PushToHubMixin): ...@@ -156,6 +231,12 @@ class ProcessorMixin(PushToHubMixin):
if isinstance(attribute, PreTrainedTokenizerBase): if isinstance(attribute, PreTrainedTokenizerBase):
del attribute.init_kwargs["auto_map"] del attribute.init_kwargs["auto_map"]
# If we save using the predefined names, we can load using `from_pretrained`
output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
self.to_json_file(output_processor_file)
logger.info(f"processor saved in {output_processor_file}")
if push_to_hub: if push_to_hub:
self._upload_modified_files( self._upload_modified_files(
save_directory, save_directory,
...@@ -165,6 +246,150 @@ class ProcessorMixin(PushToHubMixin): ...@@ -165,6 +246,150 @@ class ProcessorMixin(PushToHubMixin):
token=kwargs.get("token"), token=kwargs.get("token"),
) )
return [output_processor_file]
@classmethod
def get_processor_dict(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`.
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here.
Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)
subfolder = kwargs.pop("subfolder", "")
from_pipeline = kwargs.pop("_from_pipeline", None)
from_auto_class = kwargs.pop("_from_auto", False)
user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
if from_pipeline is not None:
user_agent["using_pipeline"] = from_pipeline
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
is_local = os.path.isdir(pretrained_model_name_or_path)
if os.path.isdir(pretrained_model_name_or_path):
processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME)
if os.path.isfile(pretrained_model_name_or_path):
resolved_processor_file = pretrained_model_name_or_path
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
processor_file = pretrained_model_name_or_path
resolved_processor_file = download_url(pretrained_model_name_or_path)
else:
processor_file = PROCESSOR_NAME
try:
# Load from local folder or from cache or download from model Hub and cache
resolved_processor_file = cached_file(
pretrained_model_name_or_path,
processor_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
token=token,
user_agent=user_agent,
revision=revision,
subfolder=subfolder,
)
except EnvironmentError:
# Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
# the original exception.
raise
except Exception:
# For any other exception, we throw a generic error.
raise EnvironmentError(
f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
" it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
f" directory containing a {PROCESSOR_NAME} file"
)
try:
# Load processor dict
with open(resolved_processor_file, "r", encoding="utf-8") as reader:
text = reader.read()
processor_dict = json.loads(text)
except json.JSONDecodeError:
raise EnvironmentError(
f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
)
if is_local:
logger.info(f"loading configuration file {resolved_processor_file}")
else:
logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")
if "auto_map" in processor_dict and not is_local:
processor_dict["auto_map"] = add_model_info_to_auto_map(
processor_dict["auto_map"], pretrained_model_name_or_path
)
return processor_dict, kwargs
@classmethod
def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
"""
Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
Args:
processor_dict (`Dict[str, Any]`):
Dictionary that will be used to instantiate the processor object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the
[`~processing_utils.ProcessingMixin.to_dict`] method.
kwargs (`Dict[str, Any]`):
Additional parameters from which to initialize the processor object.
Returns:
[`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
parameters.
"""
processor_dict = processor_dict.copy()
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
# Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`.
# We have to pop up some unused (but specific) arguments to make it work.
if "processor_class" in processor_dict:
del processor_dict["processor_class"]
if "auto_map" in processor_dict:
del processor_dict["auto_map"]
processor = cls(*args, **processor_dict)
# Update processor with kwargs if needed
for key in set(kwargs.keys()):
if hasattr(processor, key):
setattr(processor, key, kwargs.pop(key))
logger.info(f"Processor {processor}")
if return_unused_kwargs:
return processor, kwargs
else:
return processor
@classmethod @classmethod
def from_pretrained( def from_pretrained(
cls, cls,
...@@ -226,7 +451,19 @@ class ProcessorMixin(PushToHubMixin): ...@@ -226,7 +451,19 @@ class ProcessorMixin(PushToHubMixin):
kwargs["token"] = token kwargs["token"] = token
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(*args)
# Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not
# updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict.
# However, for models added in the future, we won't get the expected error if this file is missing.
try:
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
except EnvironmentError as e:
if "does not appear to have a file named processor_config.json." in str(e):
processor_dict, kwargs = {}, kwargs
else:
raise
return cls.from_args_and_dict(args, processor_dict, **kwargs)
@classmethod @classmethod
def register_for_auto_class(cls, auto_class="AutoProcessor"): def register_for_auto_class(cls, auto_class="AutoProcessor"):
......
...@@ -217,6 +217,7 @@ SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json" ...@@ -217,6 +217,7 @@ SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
CONFIG_NAME = "config.json" CONFIG_NAME = "config.json"
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
PROCESSOR_NAME = "processor_config.json"
GENERATION_CONFIG_NAME = "generation_config.json" GENERATION_CONFIG_NAME = "generation_config.json"
MODEL_CARD_NAME = "modelcard.json" MODEL_CARD_NAME = "modelcard.json"
......
...@@ -42,7 +42,7 @@ from transformers import ( ...@@ -42,7 +42,7 @@ from transformers import (
) )
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
...@@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase): ...@@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase):
self.assertIsInstance(processor, Wav2Vec2Processor) self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in tokenizer config
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f:
f.write(json.dumps(config_dict))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_feat_extr_processor_class(self): def test_processor_from_feat_extr_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor() feature_extractor = Wav2Vec2FeatureExtractor()
...@@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase): ...@@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
# save in new folder # save in new folder
processor.save_pretrained(tmpdirname) processor.save_pretrained(tmpdirname)
# drop `processor_class` in processor
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
f.write(json.dumps(config_dict))
# drop `processor_class` in tokenizer # drop `processor_class` in tokenizer
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f: with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
config_dict = json.load(f) config_dict = json.load(f)
...@@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase): ...@@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
# save in new folder # save in new folder
processor.save_pretrained(tmpdirname) processor.save_pretrained(tmpdirname)
# drop `processor_class` in processor
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
f.write(json.dumps(config_dict))
# drop `processor_class` in feature extractor # drop `processor_class` in feature extractor
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f: with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
config_dict = json.load(f) config_dict = json.load(f)
...@@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase): ...@@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase):
if CustomConfig in PROCESSOR_MAPPING._extra_content: if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig] del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
pass
class NewTokenizer(BertTokenizer):
pass
class NewProcessor(ProcessorMixin):
feature_extractor_class = "AutoFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
super().__init__(feature_extractor, tokenizer)
self.processor_attr_1 = processor_attr_1
self.processor_attr_2 = processor_attr_2
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoProcessor.register(CustomConfig, NewProcessor)
# If remote code is not set, the default is to use local classes.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor", processor_attr_2=False
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertEqual(processor.processor_attr_1, 1)
self.assertEqual(processor.processor_attr_2, False)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_auto_processor_creates_tokenizer(self): def test_auto_processor_creates_tokenizer(self):
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert") processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast") self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")
......
...@@ -26,6 +26,8 @@ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES ...@@ -26,6 +26,8 @@ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision from transformers.testing_utils import require_vision
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available(): if is_vision_available():
from PIL import Image from PIL import Image
...@@ -34,7 +36,9 @@ if is_vision_available(): ...@@ -34,7 +36,9 @@ if is_vision_available():
@require_vision @require_vision
class CLIPProcessorTest(unittest.TestCase): class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = CLIPProcessor
def setUp(self): def setUp(self):
self.tmpdirname = tempfile.mkdtemp() self.tmpdirname = tempfile.mkdtemp()
......
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import tempfile
import unittest
from transformers import CLIPTokenizerFast, ProcessorMixin
from transformers.models.auto.processing_auto import processor_class_from_name
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_tokenizers,
require_torch,
require_vision,
)
from transformers.utils import is_vision_available
if is_vision_available():
from transformers import CLIPImageProcessor
@require_torch
class ProcessorTesterMixin:
processor_class = None
def prepare_processor_dict(self):
return {}
def get_component(self, attribute, **kwargs):
assert attribute in self.processor_class.attributes
component_class_name = getattr(self.processor_class, f"{attribute}_class")
if isinstance(component_class_name, tuple):
component_class_name = component_class_name[0]
component_class = processor_class_from_name(component_class_name)
component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa
return component
def prepare_components(self):
components = {}
for attribute in self.processor_class.attributes:
component = self.get_component(attribute)
components[attribute] = component
return components
def get_processor(self):
components = self.prepare_components()
processor = self.processor_class(**components, **self.prepare_processor_dict())
return processor
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
for key, value in self.prepare_processor_dict().items():
self.assertEqual(obj[key], value)
self.assertEqual(getattr(processor, key, None), value)
def test_processor_from_and_save_pretrained(self):
processor_first = self.get_processor()
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = processor_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
processor_second = self.processor_class.from_pretrained(tmpdirname)
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
class MyProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
super().__init__(image_processor, tokenizer)
self.processor_attr_1 = processor_attr_1
self.processor_attr_2 = processor_attr_2
@require_tokenizers
@require_vision
class ProcessorTest(unittest.TestCase):
processor_class = MyProcessor
def prepare_processor_dict(self):
return {"processor_attr_1": 1, "processor_attr_2": False}
def get_processor(self):
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
return processor
def test_processor_to_json_string(self):
processor = self.get_processor()
obj = json.loads(processor.to_json_string())
for key, value in self.prepare_processor_dict().items():
self.assertEqual(obj[key], value)
self.assertEqual(getattr(processor, key, None), value)
def test_processor_from_and_save_pretrained(self):
processor_first = self.get_processor()
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = processor_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
processor_second = self.processor_class.from_pretrained(tmpdirname)
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment