Unverified commit a95fd354, authored by amyeroberts, committed by GitHub

Vision processors - replace FE with IPs (#20590)



* Replace FE references with IPs

* Update processor tests

* Update src/transformers/models/clip/processing_clip.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/clip/processing_clip.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update warning messages v4.27 -> v5

* Fixup

* Update Chinese CLIP processor

* Add feature_extractor property

* Add attributes

* Add tests
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 704027f0
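The shape of the change is the same in every touched processor: `feature_extractor` becomes `image_processor`, with a deprecation shim so old call sites keep working until v5. A minimal before/after sketch of the user-facing API (the checkpoint id is illustrative):

    from transformers import CLIPImageProcessor, CLIPProcessor, CLIPTokenizerFast

    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")

    # Before this PR (still accepted, but now emits a FutureWarning scheduled for v5):
    processor = CLIPProcessor(feature_extractor=image_processor, tokenizer=tokenizer)

    # After this PR (preferred spelling):
    processor = CLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)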
@@ -287,8 +287,8 @@ class AutoProcessor:
         raise ValueError(
             f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
-            "tokenizer or a feature extractor for this model. Make sure the repository contains the files of at least "
-            "one of those processing classes."
+            "tokenizer, an image processor or a feature extractor for this model. Make sure the repository contains "
+            "the files of at least one of those processing classes."
         )

     @staticmethod
@@ -15,39 +15,56 @@
 """
 Image/Text processor class for Chinese-CLIP
 """
+import warnings
+
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding


 class ChineseCLIPProcessor(ProcessorMixin):
     r"""
-    Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP feature extractor and a Chinese-CLIP tokenizer into
-    a single processor.
+    Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a
+    single processor.

-    [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPFeatureExtractor`] and
-    [`BertTokenizerFast`]. See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more
-    information.
+    [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`].
+    See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information.

     Args:
-        feature_extractor ([`ChineseCLIPFeatureExtractor`]):
-            The feature extractor is a required input.
+        image_processor ([`ChineseCLIPImageProcessor`]):
+            The image processor is a required input.
         tokenizer ([`BertTokenizerFast`]):
             The tokenizer is a required input.
     """
-    feature_extractor_class = "ChineseCLIPFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "ChineseCLIPImageProcessor"
     tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

-    def __init__(self, feature_extractor, tokenizer):
-        super().__init__(feature_extractor, tokenizer)
-        self.current_processor = self.feature_extractor
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
         Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
-        docstring of the above two methods for more information.
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.

         Args:
             text (`str`, `List[str]`, `List[List[str]]`):
@@ -84,7 +101,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
             encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

         if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

         if text is not None and images is not None:
             encoding["pixel_values"] = image_features.pixel_values
@@ -111,5 +128,13 @@ class ChineseCLIPProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
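The `model_input_names` merge above relies on `dict.fromkeys` to deduplicate while preserving order. A standalone sketch of that idiom, with illustrative name lists:

    tokenizer_input_names = ["input_ids", "token_type_ids", "attention_mask"]
    image_processor_input_names = ["pixel_values", "attention_mask"]

    # dict keys are unique and, since Python 3.7, insertion-ordered, so this drops
    # the duplicate "attention_mask" while keeping the original ordering intact.
    merged = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
    print(merged)  # ['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values']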
@@ -15,37 +15,54 @@
 """
 Image/Text processor class for CLIP
 """
+import warnings
+
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding


 class CLIPProcessor(ProcessorMixin):
     r"""
-    Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
+    Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor.

-    [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
+    [`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the
     [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information.

     Args:
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            The feature extractor is a required input.
+        image_processor ([`CLIPImageProcessor`]):
+            The image processor is a required input.
         tokenizer ([`CLIPTokenizerFast`]):
             The tokenizer is a required input.
     """
-    feature_extractor_class = "CLIPFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "CLIPImageProcessor"
     tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

-    def __init__(self, feature_extractor, tokenizer):
-        super().__init__(feature_extractor, tokenizer)
-        self.current_processor = self.feature_extractor
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
         Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
-        docstring of the above two methods for more information.
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.

         Args:
             text (`str`, `List[str]`, `List[List[str]]`):
@@ -82,7 +99,7 @@ class CLIPProcessor(ProcessorMixin):
             encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

         if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

         if text is not None and images is not None:
             encoding["pixel_values"] = image_features.pixel_values
@@ -109,5 +126,21 @@ class CLIPProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
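With the deprecated properties in place, old attribute access keeps working but is loud about it. A minimal sketch of what callers see (checkpoint id illustrative):

    import warnings

    from transformers import CLIPProcessor

    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fe = processor.feature_extractor  # deprecated alias, still resolves
    assert fe is processor.image_processor
    assert any(issubclass(w.category, FutureWarning) for w in caught)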
@@ -15,38 +15,54 @@
 """
 Image/Text processor class for CLIPSeg
 """
+import warnings
+
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding


 class CLIPSegProcessor(ProcessorMixin):
     r"""
-    Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single
-    processor.
+    Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor.

-    [`CLIPSegProcessor`] offers all the functionalities of [`ViTFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
+    [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the
     [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.

     Args:
-        feature_extractor ([`ViTFeatureExtractor`]):
-            The feature extractor is a required input.
+        image_processor ([`ViTImageProcessor`]):
+            The image processor is a required input.
         tokenizer ([`CLIPTokenizerFast`]):
             The tokenizer is a required input.
     """
-    feature_extractor_class = "ViTFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "ViTImageProcessor"
     tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

-    def __init__(self, feature_extractor, tokenizer):
-        super().__init__(feature_extractor, tokenizer)
-        self.current_processor = self.feature_extractor
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
         Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        ViTFeatureExtractor's [`~ViTFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
-        docstring of the above two methods for more information.
+        ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
+        the above two methods for more information.

         Args:
             text (`str`, `List[str]`, `List[List[str]]`):
@@ -83,7 +99,7 @@ class CLIPSegProcessor(ProcessorMixin):
             encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

         if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

         if text is not None and images is not None:
             encoding["pixel_values"] = image_features.pixel_values
@@ -106,3 +122,19 @@ class CLIPSegProcessor(ProcessorMixin):
         the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
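Each converted processor now declares `attributes = ["image_processor", "tokenizer"]`, which is the list `ProcessorMixin` walks when binding, saving, and reloading sub-components. A simplified toy sketch of that mechanism, to make the idea concrete; this is not the real implementation:

    class ToyProcessorMixin:
        attributes = []

        def __init__(self, *args):
            # Bind each positional argument to the name declared in `attributes`,
            # e.g. self.image_processor and self.tokenizer.
            for name, arg in zip(self.attributes, args):
                setattr(self, name, arg)

        def save_pretrained(self, save_directory):
            # Delegate saving to every declared sub-component.
            for name in self.attributes:
                getattr(self, name).save_pretrained(save_directory)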
@@ -24,7 +24,7 @@ from ...processing_utils import ProcessorMixin

 class DonutProcessor(ProcessorMixin):
     r"""
-    Constructs a Donut processor which wraps a Donut feature extractor and an XLMRoBERTa tokenizer into a single
+    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
     processor.

     [`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and
@@ -32,8 +32,8 @@ class DonutProcessor(ProcessorMixin):
     [`~DonutProcessor.decode`] for more information.

     Args:
-        feature_extractor ([`DonutFeatureExtractor`]):
-            An instance of [`DonutFeatureExtractor`]. The feature extractor is a required input.
+        image_processor ([`DonutFeatureExtractor`]):
+            An instance of [`DonutFeatureExtractor`]. The image processor is a required input.
         tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
             An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
     """
@@ -44,7 +44,7 @@ class DonutProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, **kwargs):
         if "feature_extractor" in kwargs:
             warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                 " instead.",
                 FutureWarning,
             )
@@ -176,8 +176,15 @@ class DonutProcessor(ProcessorMixin):
     @property
     def feature_extractor_class(self):
         warnings.warn(
-            "`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
-            " instead.",
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
             FutureWarning,
         )
         return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
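The same shim (warn, pop the kwarg, fall back, validate) is copy-pasted into every processor in this commit. A hypothetical helper that would factor it out, shown only to make the shared pattern explicit; the PR deliberately keeps the code duplicated per model, in line with the repository's one-file-per-model philosophy:

    import warnings


    def pop_deprecated_feature_extractor(image_processor, kwargs):
        """Hypothetical helper: resolve the deprecated `feature_extractor` kwarg."""
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")
        # Prefer the new argument, fall back to the deprecated one.
        return image_processor if image_processor is not None else feature_extractor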
@@ -15,6 +15,8 @@
 """
 Image/Text processor class for FLAVA
 """
+import warnings
+
 from typing import List, Optional, Union

 from ...image_utils import ImageInput
@@ -25,21 +27,36 @@ from ...utils import TensorType

 class FlavaProcessor(ProcessorMixin):
     r"""
-    Constructs a FLAVA processor which wraps a FLAVA feature extractor and a FLAVA tokenizer into a single processor.
+    Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.

     [`FlavaProcessor`] offers all the functionalities of [`FlavaFeatureExtractor`] and [`BertTokenizerFast`]. See the
     [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.

     Args:
-        feature_extractor ([`FlavaFeatureExtractor`]): The feature extractor is a required input.
+        image_processor ([`FlavaFeatureExtractor`]): The image processor is a required input.
         tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input.
     """
-    feature_extractor_class = "FlavaFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "FlavaFeatureExtractor"
     tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

-    def __init__(self, feature_extractor, tokenizer):
-        super().__init__(feature_extractor, tokenizer)
-        self.current_processor = self.feature_extractor
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
     def __call__(
         self,
@@ -93,7 +110,7 @@ class FlavaProcessor(ProcessorMixin):
                 **kwargs,
             )
         if images is not None:
-            image_features = self.feature_extractor(
+            image_features = self.image_processor(
                 images,
                 return_image_mask=return_image_mask,
                 return_codebook_pixels=return_codebook_pixels,
@@ -126,5 +143,21 @@ class FlavaProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
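FLAVA's `__call__` forwards extra flags such as `return_image_mask` and `return_codebook_pixels` straight to the image processor, as the hunk above shows. A hedged usage sketch (checkpoint id illustrative, output key names approximate):

    from PIL import Image

    from transformers import FlavaProcessor

    processor = FlavaProcessor.from_pretrained("facebook/flava-full")
    image = Image.new("RGB", (224, 224))

    # The masking/codebook flags are forwarded to the image processor, so the
    # output gains entries along the lines of `bool_masked_pos` and
    # `codebook_pixel_values` in addition to `pixel_values` and the text keys.
    inputs = processor(
        text="a photo", images=image, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt"
    )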
@@ -15,6 +15,8 @@
 """
 Processor class for LayoutLMv2.
 """
+import warnings
+
 from typing import List, Optional, Union

 from ...processing_utils import ProcessorMixin
@@ -24,26 +26,44 @@ from ...utils import TensorType

 class LayoutLMv2Processor(ProcessorMixin):
     r"""
-    Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
+    Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
     single processor.

     [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.

-    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
-    to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
     [`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
     `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
     into token-level `labels` for token classification tasks (such as FUNSD, CORD).

     Args:
-        feature_extractor (`LayoutLMv2FeatureExtractor`):
-            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
+        image_processor (`LayoutLMv2ImageProcessor`):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
         tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
             An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
     """
-    feature_extractor_class = "LayoutLMv2FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
     tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")

+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
     def __call__(
         self,
         images,
@@ -68,37 +88,36 @@ class LayoutLMv2Processor(ProcessorMixin):
         **kwargs
     ) -> BatchEncoding:
         """
-        This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
-        [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
         bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
-        together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
         `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
         arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images`.

         Please refer to the docstring of the above two methods for more information.
         """
         # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
             raise ValueError(
-                "You cannot provide bounding boxes "
-                "if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
             )

-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
             raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
             )

         if return_overflowing_tokens is True and return_offsets_mapping is False:
             raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")

-        # first, apply the feature extractor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)

         # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
             if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
             text_pair = features["words"]

         encoded_inputs = self.tokenizer(
@@ -162,3 +181,19 @@ class LayoutLMv2Processor(ProcessorMixin):
     @property
     def model_input_names(self):
         return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
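The `apply_ocr` guard above means callers must pick exactly one source of words and boxes. A hedged sketch of the caller-supplied mode (checkpoint id illustrative; with the default `apply_ocr=True`, passing `boxes` would raise the ValueError shown in the hunk):

    from PIL import Image

    from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor, LayoutLMv2TokenizerFast

    tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
    image = Image.new("RGB", (224, 224), "white")

    # Disable OCR so that words and 0-1000 normalized boxes come from the caller.
    image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
    processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

    encoding = processor(
        image,
        text=["hello", "world"],
        boxes=[[1, 2, 3, 4], [5, 6, 7, 8]],
        return_tensors="pt",
    )
    print(sorted(encoding.keys()))  # e.g. ['attention_mask', 'bbox', 'image', 'input_ids', 'token_type_ids']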
@@ -15,6 +15,8 @@
 """
 Processor class for LayoutLMv3.
 """
+import warnings
+
 from typing import List, Optional, Union

 from ...processing_utils import ProcessorMixin
@@ -24,26 +26,44 @@ from ...utils import TensorType

 class LayoutLMv3Processor(ProcessorMixin):
     r"""
-    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 feature extractor and a LayoutLMv3 tokenizer into a
+    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
     single processor.

     [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.

-    It first uses [`LayoutLMv3FeatureExtractor`] to resize and normalize document images, and optionally applies OCR to
+    It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
     get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
     [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
     `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
     into token-level `labels` for token classification tasks (such as FUNSD, CORD).

     Args:
-        feature_extractor (`LayoutLMv3FeatureExtractor`):
-            An instance of [`LayoutLMv3FeatureExtractor`]. The feature extractor is a required input.
+        image_processor (`LayoutLMv3ImageProcessor`):
+            An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
         tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`):
             An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
     """
-    feature_extractor_class = "LayoutLMv3FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv3ImageProcessor"
     tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")

+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
     def __call__(
         self,
         images,
@@ -68,35 +88,34 @@ class LayoutLMv3Processor(ProcessorMixin):
         **kwargs
     ) -> BatchEncoding:
         """
-        This method first forwards the `images` argument to [`~LayoutLMv3FeatureExtractor.__call__`]. In case
-        [`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
+        [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
         bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
-        together with resized and normalized `pixel_values`. In case [`LayoutLMv3FeatureExtractor`] was initialized
-        with `apply_ocr` set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user
-        along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
+        together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
+        `apply_ocr` set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along
+        with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
         resized and normalized `pixel_values`.

         Please refer to the docstring of the above two methods for more information.
         """
         # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
             raise ValueError(
-                "You cannot provide bounding boxes "
-                "if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
             )

-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
             raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
             )

-        # first, apply the feature extractor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)

         # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
             if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
             text_pair = features["words"]

         encoded_inputs = self.tokenizer(
@@ -160,3 +179,19 @@ class LayoutLMv3Processor(ProcessorMixin):
     @property
     def model_input_names(self):
         return ["input_ids", "bbox", "attention_mask", "pixel_values"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
@@ -81,10 +81,10 @@ def box_iou(boxes1, boxes2):

 class OwlViTImageProcessor(BaseImageProcessor):
     r"""
-    Constructs an OWL-ViT feature extractor.
+    Constructs an OWL-ViT image processor.

-    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
-    should refer to this superclass for more information regarding those methods.
+    This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.

     Args:
         do_resize (`bool`, *optional*, defaults to `True`):
@@ -115,7 +115,6 @@ class OwlViTImageProcessor(BaseImageProcessor):
         image_std (`List[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
             The sequence of standard deviations for each channel, to be used when normalizing images.
     """
-
     model_input_names = ["pixel_values"]

     def __init__(
@@ -139,7 +138,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
         crop_size = get_size_dict(crop_size, default_to_square=True)

         # Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
-        # vision feature extractor method `rescale` as it would be set as an attribute during the super().__init__
+        # vision image processor method `rescale` as it would be set as an attribute during the super().__init__
         # call. This is for backwards compatibility.
         if "rescale" in kwargs:
             rescale_val = kwargs.pop("rescale")
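The `rescale` comment above describes a config/method name clash that is handled by popping the kwarg before it can shadow the method. A simplified toy sketch of the pattern, not the real class:

    class ToyImageProcessor:
        def __init__(self, **kwargs):
            # An old config flag named "rescale" would clobber the `rescale`
            # method if blindly set as an attribute, so pop and rename it first.
            if "rescale" in kwargs:
                kwargs["do_rescale"] = kwargs.pop("rescale")
            for key, value in kwargs.items():
                setattr(self, key, value)

        def rescale(self, image, scale):
            # Method that must survive attribute assignment from the config.
            return image * scale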
@@ -16,6 +16,7 @@
 Image/Text processor class for OWL-ViT
 """

+import warnings
 from typing import List

 import numpy as np
@@ -28,29 +29,44 @@ from ...tokenization_utils_base import BatchEncoding

 class OwlViTProcessor(ProcessorMixin):
     r"""
-    Constructs an OWL-ViT processor which wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
-    into a single processor that inherits both the feature extractor and tokenizer functionalities. See the
+    Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
+    into a single processor that inherits both the image processor and tokenizer functionalities. See the
     [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.

     Args:
-        feature_extractor ([`OwlViTFeatureExtractor`]):
+        image_processor ([`OwlViTImageProcessor`]):
             The image processor is a required input.
         tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
             The tokenizer is a required input.
     """
-    feature_extractor_class = "OwlViTFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "OwlViTImageProcessor"
     tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
     def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
         """
         Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
         `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-        CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
-        docstring of the above two methods for more information.
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.

         Args:
             text (`str`, `List[str]`, `List[List[str]]`):
@@ -137,13 +153,13 @@ class OwlViTProcessor(ProcessorMixin):

         if query_images is not None:
             encoding = BatchEncoding()
-            query_pixel_values = self.feature_extractor(
+            query_pixel_values = self.image_processor(
                 query_images, return_tensors=return_tensors, **kwargs
             ).pixel_values
             encoding["query_pixel_values"] = query_pixel_values

         if images is not None:
-            image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

         if text is not None and images is not None:
             encoding["pixel_values"] = image_features.pixel_values
@@ -158,17 +174,17 @@ class OwlViTProcessor(ProcessorMixin):
     def post_process(self, *args, **kwargs):
         """
-        This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process`]. Please refer to the
-        docstring of this method for more information.
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process`]. Please refer to the docstring
+        of this method for more information.
         """
-        return self.feature_extractor.post_process(*args, **kwargs)
+        return self.image_processor.post_process(*args, **kwargs)

     def post_process_image_guided_detection(self, *args, **kwargs):
         """
-        This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process_one_shot_object_detection`].
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
         Please refer to the docstring of this method for more information.
         """
-        return self.feature_extractor.post_process_image_guided_detection(*args, **kwargs)
+        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)

     def batch_decode(self, *args, **kwargs):
         """
@@ -183,3 +199,19 @@ class OwlViTProcessor(ProcessorMixin):
         the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
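`__call__` above also accepts `query_images` for image-guided detection, in which case the query image is encoded as `query_pixel_values` alongside the usual outputs. A hedged usage sketch (checkpoint id illustrative):

    from PIL import Image

    from transformers import OwlViTProcessor

    processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
    image = Image.new("RGB", (768, 768))
    query_image = Image.new("RGB", (768, 768))

    # Text-conditioned detection: tokenized queries plus pixel_values.
    inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")

    # Image-guided detection: the query image is routed through the image processor.
    inputs = processor(images=image, query_images=query_image, return_tensors="pt")
    print(sorted(inputs.keys()))  # expected: ['pixel_values', 'query_pixel_values']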
@@ -42,7 +42,7 @@ class TrOCRProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, **kwargs):
         if "feature_extractor" in kwargs:
             warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                 " instead.",
                 FutureWarning,
             )
@@ -124,8 +124,15 @@ class TrOCRProcessor(ProcessorMixin):
     @property
     def feature_extractor_class(self):
         warnings.warn(
-            "`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
-            " instead.",
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
             FutureWarning,
         )
         return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
@@ -16,6 +16,7 @@
 Processor class for ViLT.
 """

+import warnings
 from typing import List, Optional, Union

 from ...processing_utils import ProcessorMixin
@@ -25,23 +26,38 @@ from ...utils import TensorType

 class ViltProcessor(ProcessorMixin):
     r"""
-    Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
+    Constructs a ViLT processor which wraps a BERT tokenizer and ViLT image processor into a single processor.

     [`ViltProcessor`] offers all the functionalities of [`ViltFeatureExtractor`] and [`BertTokenizerFast`]. See the
     docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information.

     Args:
-        feature_extractor (`ViltFeatureExtractor`):
-            An instance of [`ViltFeatureExtractor`]. The feature extractor is a required input.
+        image_processor (`ViltFeatureExtractor`):
+            An instance of [`ViltFeatureExtractor`]. The image processor is a required input.
         tokenizer (`BertTokenizerFast`):
             An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
     """
-    feature_extractor_class = "ViltFeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "ViltFeatureExtractor"
     tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

-    def __init__(self, feature_extractor, tokenizer):
-        super().__init__(feature_extractor, tokenizer)
-        self.current_processor = self.feature_extractor
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
     def __call__(
         self,
@@ -88,8 +104,8 @@ class ViltProcessor(ProcessorMixin):
             **kwargs,
         )
         # add pixel_values + pixel_mask
-        encoding_feature_extractor = self.feature_extractor(images, return_tensors=return_tensors)
-        encoding.update(encoding_feature_extractor)
+        encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
+        encoding.update(encoding_image_processor)

         return encoding
@@ -110,5 +126,21 @@ class ViltProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
-        feature_extractor_input_names = self.feature_extractor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
@@ -44,7 +44,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, **kwargs):
         if "feature_extractor" in kwargs:
             warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                 " instead.",
                 FutureWarning,
             )
@@ -132,10 +132,18 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

+    @property
     def feature_extractor_class(self):
         warnings.warn(
-            "`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
-            " instead.",
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
             FutureWarning,
         )
         return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
...@@ -15,38 +15,55 @@ ...@@ -15,38 +15,55 @@
""" """
Image/Text processor class for XCLIP Image/Text processor class for XCLIP
""" """
import warnings
from ...processing_utils import ProcessorMixin from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_base import BatchEncoding
class XCLIPProcessor(ProcessorMixin): class XCLIPProcessor(ProcessorMixin):
r""" r"""
Constructs an X-CLIP processor which wraps a VideoMAE feature extractor and a CLIP tokenizer into a single Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor.
processor.
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEFeatureExtractor`] and [`CLIPTokenizerFast`]. See [`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the
the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information. [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
Args: Args:
feature_extractor ([`VideoMAEFeatureExtractor`]): image_processor ([`VideoMAEImageProcessor`]):
The feature extractor is a required input. The image processor is a required input.
tokenizer ([`CLIPTokenizerFast`]): tokenizer ([`CLIPTokenizerFast`]):
The tokenizer is a required input. The tokenizer is a required input.
""" """
feature_extractor_class = "VideoMAEFeatureExtractor" attributes = ["image_processor", "tokenizer"]
image_processor_class = "VideoMAEImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, feature_extractor, tokenizer): def __init__(self, image_processor=None, tokenizer=None, **kwargs):
super().__init__(feature_extractor, tokenizer) if "feature_extractor" in kwargs:
self.current_processor = self.feature_extractor warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
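A usage sketch of the two construction paths this `__init__` now supports (the checkpoint name is a public one, used here only for illustration):
from transformers import CLIPTokenizerFast, VideoMAEImageProcessor, XCLIPProcessor

image_processor = VideoMAEImageProcessor()  # default config, stands in for a real checkpoint
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")

processor = XCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
# Legacy call signature still works, with a FutureWarning, until v5:
legacy = XCLIPProcessor(feature_extractor=image_processor, tokenizer=tokenizer)
assert legacy.image_processor is image_processor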
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
""" """
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text` Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `videos` is not `None`. Please refer to VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
the docstring of the above two methods for more information. docstring of the above two methods for more information.
Args: Args:
text (`str`, `List[str]`, `List[List[str]]`): text (`str`, `List[str]`, `List[List[str]]`):
...@@ -84,7 +101,7 @@ class XCLIPProcessor(ProcessorMixin): ...@@ -84,7 +101,7 @@ class XCLIPProcessor(ProcessorMixin):
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if videos is not None: if videos is not None:
image_features = self.feature_extractor(videos, return_tensors=return_tensors, **kwargs) image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
if text is not None and videos is not None: if text is not None and videos is not None:
encoding["pixel_values"] = image_features.pixel_values encoding["pixel_values"] = image_features.pixel_values
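A hedged usage sketch of this `__call__`, assuming the `processor` built above and NumPy available; the printed key list is indicative, not guaranteed:
import numpy as np

# One video as a list of 8 RGB frames, sized to the image processor's defaults.
video = [np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) for _ in range(8)]
inputs = processor(text="a photo of a cat", videos=video, return_tensors="np")
print(sorted(inputs.keys()))  # e.g. ['attention_mask', 'input_ids', 'pixel_values']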
...@@ -111,3 +128,19 @@ class XCLIPProcessor(ProcessorMixin): ...@@ -111,3 +128,19 @@ class XCLIPProcessor(ProcessorMixin):
@property @property
def model_input_names(self): def model_input_names(self):
return ["input_ids", "attention_mask", "position_ids", "pixel_values"] return ["input_ids", "attention_mask", "position_ids", "pixel_values"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
...@@ -158,7 +158,8 @@ class ProcessorMixin(PushToHubMixin): ...@@ -158,7 +158,8 @@ class ProcessorMixin(PushToHubMixin):
<Tip> <Tip>
This class method is simply calling the feature extractor This class method is simply calling the feature extractor
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
[`~image_processing_utils.ImageProcessingMixin.from_pretrained`] and the tokenizer
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
methods above for more information. methods above for more information.
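A minimal sketch of that composition from the user's side, using a public CLIP checkpoint for illustration:
from transformers import CLIPProcessor

# One call materializes both sub-objects from the same repo.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
print(type(processor.image_processor).__name__)  # CLIPImageProcessor
print(type(processor.tokenizer).__name__)        # CLIPTokenizerFast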
......
...@@ -30,7 +30,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available ...@@ -30,7 +30,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
if is_vision_available(): if is_vision_available():
from PIL import Image from PIL import Image
from transformers import ChineseCLIPFeatureExtractor, ChineseCLIPProcessor from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor
@require_vision @require_vision
...@@ -62,7 +62,7 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -62,7 +62,7 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
feature_extractor_map = { image_processor_map = {
"do_resize": True, "do_resize": True,
"size": {"height": 224, "width": 224}, "size": {"height": 224, "width": 224},
"do_center_crop": True, "do_center_crop": True,
...@@ -72,9 +72,9 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -72,9 +72,9 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
"image_std": [0.26862954, 0.26130258, 0.27577711], "image_std": [0.26862954, 0.26130258, 0.27577711],
"do_convert_rgb": True, "do_convert_rgb": True,
} }
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) self.image_processor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp: with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp) json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
...@@ -82,8 +82,8 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -82,8 +82,8 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs): def get_image_processor(self, **kwargs):
return ChineseCLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) return ChineseCLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
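One note on this fixture: `FEATURE_EXTRACTOR_NAME` and `IMAGE_PROCESSOR_NAME` should both resolve to `preprocessor_config.json` at this point, so `from_pretrained` finds the JSON written in `setUp` under either constant. A sketch of the round-trip this getter performs, with `tmpdir` standing in for `self.tmpdirname`:
from transformers import ChineseCLIPImageProcessor

# `tmpdir` is assumed to hold the preprocessor_config.json written in setUp.
image_processor = ChineseCLIPImageProcessor.from_pretrained(tmpdir, do_normalize=False)
assert image_processor.do_normalize is False  # call-site kwargs override the saved file
assert image_processor.do_resize is True      # everything else comes from the JSON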
def tearDown(self): def tearDown(self):
shutil.rmtree(self.tmpdirname) shutil.rmtree(self.tmpdirname)
...@@ -102,13 +102,13 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -102,13 +102,13 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self): def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer() tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer() tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor) processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname) processor_slow.save_pretrained(self.tmpdirname)
processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname) processor_fast.save_pretrained(self.tmpdirname)
processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname) processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname)
...@@ -118,19 +118,17 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -118,19 +118,17 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, ChineseCLIPFeatureExtractor) self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor)
self.assertIsInstance(processor_fast.feature_extractor, ChineseCLIPFeatureExtractor) self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor)
def test_save_load_pretrained_additional_features(self): def test_save_load_pretrained_additional_features(self):
processor = ChineseCLIPProcessor( processor = ChineseCLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()
)
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)") tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False) image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
processor = ChineseCLIPProcessor.from_pretrained( processor = ChineseCLIPProcessor.from_pretrained(
self.tmpdirname, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False self.tmpdirname, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False
...@@ -139,28 +137,28 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -139,28 +137,28 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, ChineseCLIPFeatureExtractor) self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor)
def test_feature_extractor(self): def test_image_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np") input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np") input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys(): for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self): def test_tokenizer(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "Alexandra,T-shirt的价格是15便士。" input_str = "Alexandra,T-shirt的价格是15便士。"
...@@ -172,10 +170,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -172,10 +170,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key]) self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self): def test_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "Alexandra,T-shirt的价格是15便士。" input_str = "Alexandra,T-shirt的价格是15便士。"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
...@@ -189,10 +187,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -189,10 +187,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
processor() processor()
def test_tokenizer_decode(self): def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
...@@ -202,10 +200,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase): ...@@ -202,10 +200,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor) self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self): def test_model_input_names(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "Alexandra,T-shirt的价格是15便士。" input_str = "Alexandra,T-shirt的价格是15便士。"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
......
...@@ -24,13 +24,13 @@ import pytest ...@@ -24,13 +24,13 @@ import pytest
from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision from transformers.testing_utils import require_vision
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
if is_vision_available(): if is_vision_available():
from PIL import Image from PIL import Image
from transformers import CLIPFeatureExtractor, CLIPProcessor from transformers import CLIPImageProcessor, CLIPProcessor
@require_vision @require_vision
...@@ -52,7 +52,7 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -52,7 +52,7 @@ class CLIPProcessorTest(unittest.TestCase):
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
feature_extractor_map = { image_processor_map = {
"do_resize": True, "do_resize": True,
"size": 20, "size": 20,
"do_center_crop": True, "do_center_crop": True,
...@@ -61,9 +61,9 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -61,9 +61,9 @@ class CLIPProcessorTest(unittest.TestCase):
"image_mean": [0.48145466, 0.4578275, 0.40821073], "image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711], "image_std": [0.26862954, 0.26130258, 0.27577711],
} }
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp: with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp) json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
...@@ -71,8 +71,8 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -71,8 +71,8 @@ class CLIPProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs):
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs): def get_image_processor(self, **kwargs):
return CLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) return CLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self): def tearDown(self):
shutil.rmtree(self.tmpdirname) shutil.rmtree(self.tmpdirname)
...@@ -91,13 +91,13 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -91,13 +91,13 @@ class CLIPProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self): def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer() tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer() tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor) processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname) processor_slow.save_pretrained(self.tmpdirname)
processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname) processor_fast.save_pretrained(self.tmpdirname)
processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname) processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)
...@@ -107,17 +107,17 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -107,17 +107,17 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, CLIPFeatureExtractor) self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor)
self.assertIsInstance(processor_fast.feature_extractor, CLIPFeatureExtractor) self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor)
def test_save_load_pretrained_additional_features(self): def test_save_load_pretrained_additional_features(self):
processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = CLIPProcessor.from_pretrained( processor = CLIPProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
...@@ -126,28 +126,28 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -126,28 +126,28 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor) self.assertIsInstance(processor.image_processor, CLIPImageProcessor)
def test_feature_extractor(self): def test_image_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np") input_image_proc = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np") input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys(): for key in input_image_proc.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self): def test_tokenizer(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
...@@ -159,10 +159,10 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -159,10 +159,10 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key]) self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self): def test_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
...@@ -176,10 +176,10 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -176,10 +176,10 @@ class CLIPProcessorTest(unittest.TestCase):
processor() processor()
def test_tokenizer_decode(self): def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
...@@ -189,10 +189,10 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -189,10 +189,10 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor) self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self): def test_model_input_names(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
......
...@@ -24,13 +24,13 @@ import pytest ...@@ -24,13 +24,13 @@ import pytest
from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision from transformers.testing_utils import require_vision
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
if is_vision_available(): if is_vision_available():
from PIL import Image from PIL import Image
from transformers import CLIPSegProcessor, ViTFeatureExtractor from transformers import CLIPSegProcessor, ViTImageProcessor
@require_vision @require_vision
...@@ -52,7 +52,7 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -52,7 +52,7 @@ class CLIPSegProcessorTest(unittest.TestCase):
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
feature_extractor_map = { image_processor_map = {
"do_resize": True, "do_resize": True,
"size": 20, "size": 20,
"do_center_crop": True, "do_center_crop": True,
...@@ -61,9 +61,9 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -61,9 +61,9 @@ class CLIPSegProcessorTest(unittest.TestCase):
"image_mean": [0.48145466, 0.4578275, 0.40821073], "image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711], "image_std": [0.26862954, 0.26130258, 0.27577711],
} }
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp: with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp) json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
...@@ -71,8 +71,8 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -71,8 +71,8 @@ class CLIPSegProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs):
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs): def get_image_processor(self, **kwargs):
return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self): def tearDown(self):
shutil.rmtree(self.tmpdirname) shutil.rmtree(self.tmpdirname)
...@@ -90,13 +90,13 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -90,13 +90,13 @@ class CLIPSegProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self): def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer() tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer() tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor) processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname) processor_slow.save_pretrained(self.tmpdirname)
processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False) processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname) processor_fast.save_pretrained(self.tmpdirname)
processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname) processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname)
...@@ -106,17 +106,17 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -106,17 +106,17 @@ class CLIPSegProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, ViTFeatureExtractor) self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor)
self.assertIsInstance(processor_fast.feature_extractor, ViTFeatureExtractor) self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor)
def test_save_load_pretrained_additional_features(self): def test_save_load_pretrained_additional_features(self):
processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = CLIPSegProcessor.from_pretrained( processor = CLIPSegProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
...@@ -125,28 +125,28 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -125,28 +125,28 @@ class CLIPSegProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, ViTFeatureExtractor) self.assertIsInstance(processor.image_processor, ViTImageProcessor)
def test_feature_extractor(self): def test_image_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np") input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np") input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys(): for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self): def test_tokenizer(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
...@@ -158,10 +158,10 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -158,10 +158,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key]) self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self): def test_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
...@@ -175,10 +175,10 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -175,10 +175,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
processor() processor()
def test_tokenizer_decode(self): def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
......
...@@ -25,13 +25,13 @@ import pytest ...@@ -25,13 +25,13 @@ import pytest
from transformers import BertTokenizer, BertTokenizerFast from transformers import BertTokenizer, BertTokenizerFast
from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision from transformers.testing_utils import require_vision
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
if is_vision_available(): if is_vision_available():
from PIL import Image from PIL import Image
from transformers import FlavaFeatureExtractor, FlavaProcessor from transformers import FlavaImageProcessor, FlavaProcessor
from transformers.models.flava.image_processing_flava import ( from transformers.models.flava.image_processing_flava import (
FLAVA_CODEBOOK_MEAN, FLAVA_CODEBOOK_MEAN,
FLAVA_CODEBOOK_STD, FLAVA_CODEBOOK_STD,
...@@ -53,7 +53,7 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -53,7 +53,7 @@ class FlavaProcessorTest(unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write("".join([x + "\n" for x in vocab_tokens])) fp.write("".join([x + "\n" for x in vocab_tokens]))
feature_extractor_map = { image_processor_map = {
"image_mean": FLAVA_IMAGE_MEAN, "image_mean": FLAVA_IMAGE_MEAN,
"image_std": FLAVA_IMAGE_STD, "image_std": FLAVA_IMAGE_STD,
"do_normalize": True, "do_normalize": True,
...@@ -77,9 +77,9 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -77,9 +77,9 @@ class FlavaProcessorTest(unittest.TestCase):
"codebook_image_std": FLAVA_CODEBOOK_STD, "codebook_image_std": FLAVA_CODEBOOK_STD,
} }
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp: with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp) json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs): def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
...@@ -87,8 +87,8 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -87,8 +87,8 @@ class FlavaProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs): def get_image_processor(self, **kwargs):
return FlavaFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self): def tearDown(self):
shutil.rmtree(self.tmpdirname) shutil.rmtree(self.tmpdirname)
...@@ -107,13 +107,13 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -107,13 +107,13 @@ class FlavaProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self): def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer() tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer() tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor) processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname) processor_slow.save_pretrained(self.tmpdirname)
processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False) processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname) processor_fast.save_pretrained(self.tmpdirname)
processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname) processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname)
...@@ -123,17 +123,17 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -123,17 +123,17 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, FlavaFeatureExtractor) self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor)
self.assertIsInstance(processor_fast.feature_extractor, FlavaFeatureExtractor) self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor)
def test_save_load_pretrained_additional_features(self): def test_save_load_pretrained_additional_features(self):
processor = FlavaProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = FlavaProcessor.from_pretrained( processor = FlavaProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
...@@ -142,18 +142,18 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -142,18 +142,18 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, FlavaFeatureExtractor) self.assertIsInstance(processor.image_processor, FlavaImageProcessor)
def test_feature_extractor(self): def test_image_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np") input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np") input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys(): for key in input_feat_extract.keys():
...@@ -161,7 +161,7 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -161,7 +161,7 @@ class FlavaProcessorTest(unittest.TestCase):
# With rest of the args # With rest of the args
random.seed(1234) random.seed(1234)
input_feat_extract = feature_extractor( input_feat_extract = image_processor(
image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np"
) )
random.seed(1234) random.seed(1234)
...@@ -173,10 +173,10 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -173,10 +173,10 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self): def test_tokenizer(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
...@@ -188,10 +188,10 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -188,10 +188,10 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key]) self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self): def test_processor(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
...@@ -220,10 +220,10 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -220,10 +220,10 @@ class FlavaProcessorTest(unittest.TestCase):
processor() processor()
def test_tokenizer_decode(self): def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
...@@ -233,10 +233,10 @@ class FlavaProcessorTest(unittest.TestCase): ...@@ -233,10 +233,10 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor) self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self): def test_model_input_names(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
......
...@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes ...@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
if is_pytesseract_available(): if is_pytesseract_available():
from PIL import Image from PIL import Image
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2Processor from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor
@require_pytesseract @require_pytesseract
...@@ -59,7 +59,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -59,7 +59,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
"lowest", "lowest",
] ]
feature_extractor_map = { image_processor_map = {
"do_resize": True, "do_resize": True,
"size": 224, "size": 224,
"apply_ocr": True, "apply_ocr": True,
...@@ -69,9 +69,9 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -69,9 +69,9 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) self.image_processing_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: with open(self.image_processing_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n") fp.write(json.dumps(image_processor_map) + "\n")
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
...@@ -82,8 +82,8 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -82,8 +82,8 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
def get_feature_extractor(self, **kwargs): def get_image_processor(self, **kwargs):
return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self): def tearDown(self):
shutil.rmtree(self.tmpdirname) shutil.rmtree(self.tmpdirname)
...@@ -100,10 +100,10 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -100,10 +100,10 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
return image_inputs return image_inputs
def test_save_load_pretrained_default(self): def test_save_load_pretrained_default(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizers = self.get_tokenizers() tokenizers = self.get_tokenizers()
for tokenizer in tokenizers: for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(self.tmpdirname)
processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname) processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)
...@@ -111,16 +111,16 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -111,16 +111,16 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)) self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
def test_save_load_pretrained_additional_features(self): def test_save_load_pretrained_additional_features(self):
processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer()) processor = LayoutLMv2Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
processor.save_pretrained(self.tmpdirname) processor.save_pretrained(self.tmpdirname)
# slow tokenizer # slow tokenizer
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30) image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
processor = LayoutLMv2Processor.from_pretrained( processor = LayoutLMv2Processor.from_pretrained(
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
...@@ -129,12 +129,12 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -129,12 +129,12 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer) self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
# fast tokenizer # fast tokenizer
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30) image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
processor = LayoutLMv2Processor.from_pretrained( processor = LayoutLMv2Processor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
...@@ -143,14 +143,14 @@ class LayoutLMv2ProcessorTest(unittest.TestCase): ...@@ -143,14 +143,14 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast) self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
def test_model_input_names(self): def test_model_input_names(self):
feature_extractor = self.get_feature_extractor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
processor = LayoutLMv2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) processor = LayoutLMv2Processor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer" input_str = "lower newer"
image_input = self.prepare_image_inputs() image_input = self.prepare_image_inputs()
@@ -220,15 +220,15 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
     def test_processor_case_1(self):
         # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
-        feature_extractor = LayoutLMv2FeatureExtractor()
+        image_processor = LayoutLMv2ImageProcessor()
         tokenizers = self.get_tokenizers
         images = self.get_images

         for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

             # not batched
-            input_feat_extract = feature_extractor(images[0], return_tensors="pt")
+            input_image_proc = image_processor(images[0], return_tensors="pt")
             input_processor = processor(images[0], return_tensors="pt")

             # verify keys
@@ -237,9 +237,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
             self.assertListEqual(actual_keys, expected_keys)

             # verify image
-            self.assertAlmostEqual(
-                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
-            )
+            self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)

             # verify input_ids
             # this was obtained with Tesseract 4.1.1
@@ -250,7 +248,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
             self.assertSequenceEqual(decoding, expected_decoding)

             # batched
-            input_feat_extract = feature_extractor(images, return_tensors="pt")
+            input_image_proc = image_processor(images, return_tensors="pt")
             input_processor = processor(images, padding=True, return_tensors="pt")

             # verify keys
@@ -259,9 +257,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
             self.assertListEqual(actual_keys, expected_keys)

             # verify images
-            self.assertAlmostEqual(
-                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
-            )
+            self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)

             # verify input_ids
             # this was obtained with Tesseract 4.1.1
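The collapsed assertions compare tensor sums because the standalone image processor returns its pixels under `pixel_values`, while the combined processor emits the same tensor under `image`. A self-contained sketch of that equivalence, using `apply_ocr=False` with hand-supplied words and boxes so it runs without Tesseract (the image, words, and boxes below are stand-ins, not the repository's fixtures):

```python
import torch
from PIL import Image
from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor, LayoutLMv2TokenizerFast

# apply_ocr=False avoids the Tesseract dependency; words/boxes are supplied by hand.
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

image = Image.new("RGB", (224, 224), "white")  # stand-in for a document page
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # 0-1000 normalized, one box per word

# The bare image processor emits "pixel_values"; the combined processor stores
# the same tensor under "image", hence the sum comparison in the test above.
pixels = image_processor(image, return_tensors="pt")["pixel_values"]
encoding = processor(image, words, boxes=boxes, return_tensors="pt")
torch.testing.assert_close(pixels.sum(), encoding["image"].sum(), atol=1e-2, rtol=0)
```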
@@ -275,12 +271,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
     def test_processor_case_2(self):
         # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
-        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
         tokenizers = self.get_tokenizers
         images = self.get_images

         for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

             # not batched
             words = ["hello", "world"]
@@ -329,12 +325,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
     def test_processor_case_3(self):
         # case 3: token classification (training), apply_ocr=False
-        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
         tokenizers = self.get_tokenizers
         images = self.get_images

         for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

             # not batched
             words = ["weirdly", "world"]
@@ -394,12 +390,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
     def test_processor_case_4(self):
         # case 4: visual question answering (inference), apply_ocr=True
-        feature_extractor = LayoutLMv2FeatureExtractor()
+        image_processor = LayoutLMv2ImageProcessor()
         tokenizers = self.get_tokenizers
         images = self.get_images

         for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

             # not batched
             question = "What's his name?"
@@ -445,12 +441,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
     def test_processor_case_5(self):
         # case 5: visual question answering (inference), apply_ocr=False
-        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
         tokenizers = self.get_tokenizers
         images = self.get_images

         for tokenizer in tokenizers:
-            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

             # not batched
             question = "What's his name?"
...
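Case 5 is the same question-answering flow with OCR disabled, so the question goes in as `text` and the pre-extracted words as `text_pair`. A hedged end-to-end sketch (the image, words, and boxes are stand-ins):

```python
from PIL import Image
from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor, LayoutLMv2TokenizerFast

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

image = Image.new("RGB", (224, 224), "white")  # stand-in document page
question = "What's his name?"
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # 0-1000 normalized, one box per word

# The question is encoded as `text`, the pre-OCR'd words as `text_pair`.
encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
print(sorted(encoding.keys()))  # ['attention_mask', 'bbox', 'image', 'input_ids', 'token_type_ids']
```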