Unverified commit a95fd354 authored by amyeroberts, committed by GitHub

Vision processors - replace FE with IPs (#20590)



* Replace FE references with IPs

* Update processor tests

* Update src/transformers/models/clip/processing_clip.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/clip/processing_clip.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update warning messages v4.27 -> v5

* Fixup

* Update Chinese CLIP processor

* Add feature_extractor property

* Add attributes

* Add tests
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 704027f0
......@@ -287,8 +287,8 @@ class AutoProcessor:
raise ValueError(
f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
"tokenizer or a feature extractor for this model. Make sure the repository contains the files of at least "
"one of those processing classes."
"tokenizer, an image processor or a feature extractor for this model. Make sure the repository contains"
"the files of at least one of those processing classes."
)
@staticmethod
......
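A quick illustration of where this updated message surfaces — a minimal sketch, assuming `transformers` is installed; the local path is hypothetical, and depending on the config the lookup may fail earlier with a different error:

```python
from transformers import AutoProcessor

# Hypothetical local path: a directory holding a config.json but none of the
# tokenizer / image processor / feature extractor files.
try:
    processor = AutoProcessor.from_pretrained("./model-without-processing-files")
except ValueError as err:
    print(err)  # now mentions tokenizer, image processor and feature extractor
```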
......@@ -15,39 +15,56 @@
"""
Image/Text processor class for Chinese-CLIP
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class ChineseCLIPProcessor(ProcessorMixin):
r"""
Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP feature extractor and a Chinese-CLIP tokenizer into
a single processor.
Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a
single processor.
[`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPFeatureExtractor`] and
[`BertTokenizerFast`]. See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more
information.
[`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`].
See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information.
Args:
feature_extractor ([`ChineseCLIPFeatureExtractor`]):
The feature extractor is a required input.
image_processor ([`ChineseCLIPImageProcessor`]):
The image processor is a required input.
tokenizer ([`BertTokenizerFast`]):
The tokenizer is a required input.
"""
feature_extractor_class = "ChineseCLIPFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "ChineseCLIPImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
docstring of the above two methods for more information.
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
......@@ -84,7 +101,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if images is not None:
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
......@@ -111,5 +128,13 @@ class ChineseCLIPProcessor(ProcessorMixin):
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
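This constructor shim is the pattern repeated across every processor touched by this PR. A dependency-free sketch of the idea (class and attribute names here are illustrative, not the library's):

```python
import warnings


class ToyProcessor:
    """Illustrative stand-in for the back-compat shim shared by these processors."""

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = kwargs.pop("feature_extractor", None)
        if feature_extractor is not None:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be"
                " removed in v5, use `image_processor` instead.",
                FutureWarning,
            )
        # The new name wins; the legacy keyword only fills the gap.
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        self.image_processor = image_processor
        self.tokenizer = tokenizer


# Legacy call site: still constructs, but emits a FutureWarning.
proc = ToyProcessor(feature_extractor="ip-stub", tokenizer="tok-stub")
assert proc.image_processor == "ip-stub"
```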
......@@ -15,37 +15,54 @@
"""
Image/Text processor class for CLIP
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class CLIPProcessor(ProcessorMixin):
r"""
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor.
[`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
[`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the
[`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information.
Args:
feature_extractor ([`CLIPFeatureExtractor`]):
The feature extractor is a required input.
image_processor ([`CLIPImageProcessor`]):
The image processor is a required input.
tokenizer ([`CLIPTokenizerFast`]):
The tokenizer is a required input.
"""
feature_extractor_class = "CLIPFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
docstring of the above two methods for more information.
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
......@@ -82,7 +99,7 @@ class CLIPProcessor(ProcessorMixin):
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if images is not None:
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
......@@ -109,5 +126,21 @@ class CLIPProcessor(ProcessorMixin):
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
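End-to-end, the migrated processor is used the same way as before; a hedged usage sketch, assuming `transformers`, `numpy`, `Pillow` and network access to the Hub (checkpoint `openai/clip-vit-base-patch32`):

```python
import numpy as np
from PIL import Image
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# A dummy black image stands in for real data.
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
inputs = processor(text=["a photo of a cat"], images=image, return_tensors="np")

print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'pixel_values']

# The legacy attribute still resolves (with a FutureWarning) to the new one:
assert processor.feature_extractor is processor.image_processor
```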
......@@ -15,38 +15,54 @@
"""
Image/Text processor class for CLIPSeg
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class CLIPSegProcessor(ProcessorMixin):
r"""
Constructs a CLIPSeg processor which wraps a CLIPSeg feature extractor and a CLIP tokenizer into a single
processor.
Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor.
[`CLIPSegProcessor`] offers all the functionalities of [`ViTFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
[`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the
[`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information.
Args:
feature_extractor ([`ViTFeatureExtractor`]):
The feature extractor is a required input.
image_processor ([`ViTImageProcessor`]):
The image processor is a required input.
tokenizer ([`CLIPTokenizerFast`]):
The tokenizer is a required input.
"""
feature_extractor_class = "ViTFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "ViTImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
ViTFeatureExtractor's [`~ViTFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
docstring of the above two methods for more information.
ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
......@@ -83,7 +99,7 @@ class CLIPSegProcessor(ProcessorMixin):
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if images is not None:
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
......@@ -106,3 +122,19 @@ class CLIPSegProcessor(ProcessorMixin):
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
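The deprecated properties added above all follow one shape: warn, then delegate to the new attribute. A self-contained sketch of that shim, with a check that the warning actually fires (names are illustrative):

```python
import warnings


class LegacyAliasDemo:
    """Illustrative sketch of the deprecated-property shim used in this PR."""

    def __init__(self, image_processor):
        self.image_processor = image_processor

    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5."
            " Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor


demo = LegacyAliasDemo(image_processor="ip-stub")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert demo.feature_extractor == "ip-stub"  # the old name still works...
assert caught[0].category is FutureWarning      # ...but warns on access
```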
......@@ -24,7 +24,7 @@ from ...processing_utils import ProcessorMixin
class DonutProcessor(ProcessorMixin):
r"""
Constructs a Donut processor which wraps a Donut feature extractor and an XLMRoBERTa tokenizer into a single
Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
processor.
[`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and
......@@ -32,8 +32,8 @@ class DonutProcessor(ProcessorMixin):
[`~DonutProcessor.decode`] for more information.
Args:
feature_extractor ([`DonutFeatureExtractor`]):
An instance of [`DonutFeatureExtractor`]. The feature extractor is a required input.
image_processor ([`DonutFeatureExtractor`]):
An instance of [`DonutFeatureExtractor`]. The image processor is a required input.
tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
"""
......@@ -44,7 +44,7 @@ class DonutProcessor(ProcessorMixin):
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
......@@ -176,8 +176,15 @@ class DonutProcessor(ProcessorMixin):
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
" instead.",
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
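Code migrating ahead of v5 can promote these deprecation warnings to hard errors, for example in CI. A small generic sketch using only the standard library:

```python
import warnings

# Treat the deprecation as an error so any remaining `feature_extractor`
# call sites are flagged immediately instead of warning silently.
with warnings.catch_warnings():
    warnings.simplefilter("error", FutureWarning)
    try:
        warnings.warn("`feature_extractor` is deprecated ...", FutureWarning)
    except FutureWarning as err:
        print("caught:", err)
```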
......@@ -15,6 +15,8 @@
"""
Image/Text processor class for FLAVA
"""
import warnings
from typing import List, Optional, Union
from ...image_utils import ImageInput
......@@ -25,21 +27,36 @@ from ...utils import TensorType
class FlavaProcessor(ProcessorMixin):
r"""
Constructs a FLAVA processor which wraps a FLAVA feature extractor and a FLAVA tokenizer into a single processor.
Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.
[`FlavaProcessor`] offers all the functionalities of [`FlavaFeatureExtractor`] and [`BertTokenizerFast`]. See the
[`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
Args:
feature_extractor ([`FlavaFeatureExtractor`]): The feature extractor is a required input.
image_processor ([`FlavaFeatureExtractor`]): The image processor is a required input.
tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input.
"""
feature_extractor_class = "FlavaFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "FlavaFeatureExtractor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
......@@ -93,7 +110,7 @@ class FlavaProcessor(ProcessorMixin):
**kwargs,
)
if images is not None:
image_features = self.feature_extractor(
image_features = self.image_processor(
images,
return_image_mask=return_image_mask,
return_codebook_pixels=return_codebook_pixels,
......@@ -126,5 +143,21 @@ class FlavaProcessor(ProcessorMixin):
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
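The `model_input_names` merge seen in these processors relies on `dict.fromkeys` to deduplicate while preserving first-seen order; a quick standalone illustration:

```python
tokenizer_input_names = ["input_ids", "token_type_ids", "attention_mask"]
image_processor_input_names = ["pixel_values", "attention_mask"]

# dict.fromkeys keeps first-seen order and drops duplicate keys.
merged = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
print(merged)  # ['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values']
```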
......@@ -15,6 +15,8 @@
"""
Processor class for LayoutLMv2.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
......@@ -24,26 +26,44 @@ from ...utils import TensorType
class LayoutLMv2Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
single processor.
[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
feature_extractor (`LayoutLMv2FeatureExtractor`):
An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
image_processor (`LayoutLMv2ImageProcessor`):
An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
"""
feature_extractor_class = "LayoutLMv2FeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv2ImageProcessor"
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
......@@ -68,37 +88,36 @@ class LayoutLMv2Processor(ProcessorMixin):
**kwargs
) -> BatchEncoding:
"""
This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
[`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
[`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
`False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images`.
Please refer to the docstring of the above two methods for more information.
"""
# verify input
if self.feature_extractor.apply_ocr and (boxes is not None):
if self.image_processor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the feature extractor with apply_ocr set to True."
"You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
)
if self.feature_extractor.apply_ocr and (word_labels is not None):
if self.image_processor.apply_ocr and (word_labels is not None):
raise ValueError(
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
"You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
)
if return_overflowing_tokens is True and return_offsets_mapping is False:
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
# first, apply the image processor
features = self.image_processor(images=images, return_tensors=return_tensors)
# second, apply the tokenizer
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
if text is not None and self.image_processor.apply_ocr and text_pair is None:
if isinstance(text, str):
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
text = [text] # add batch dimension (as the image processor always adds a batch dimension)
text_pair = features["words"]
encoded_inputs = self.tokenizer(
......@@ -162,3 +181,19 @@ class LayoutLMv2Processor(ProcessorMixin):
@property
def model_input_names(self):
return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
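The `apply_ocr` checks above reduce to a small validation rule; a dependency-free sketch (the function name is illustrative):

```python
def validate_call_inputs(apply_ocr, boxes=None, word_labels=None):
    """Illustrative distillation of the checks in LayoutLMv2Processor.__call__."""
    if apply_ocr and boxes is not None:
        raise ValueError(
            "You cannot provide bounding boxes if you initialized the image "
            "processor with apply_ocr set to True."
        )
    if apply_ocr and word_labels is not None:
        raise ValueError(
            "You cannot provide word labels if you initialized the image "
            "processor with apply_ocr set to True."
        )


validate_call_inputs(apply_ocr=False, boxes=[[0, 0, 10, 10]])  # OK: user supplies boxes
validate_call_inputs(apply_ocr=True)                           # OK: OCR supplies them
# validate_call_inputs(apply_ocr=True, boxes=[[0, 0, 10, 10]])  # -> ValueError
```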
......@@ -15,6 +15,8 @@
"""
Processor class for LayoutLMv3.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
......@@ -24,26 +26,44 @@ from ...utils import TensorType
class LayoutLMv3Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 feature extractor and a LayoutLMv3 tokenizer into a
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
single processor.
[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv3FeatureExtractor`] to resize and normalize document images, and optionally applies OCR to
It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
feature_extractor (`LayoutLMv3FeatureExtractor`):
An instance of [`LayoutLMv3FeatureExtractor`]. The feature extractor is a required input.
image_processor (`LayoutLMv3ImageProcessor`):
An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`):
An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
"""
feature_extractor_class = "LayoutLMv3FeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "LayoutLMv3ImageProcessor"
tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(
self,
images,
......@@ -68,35 +88,34 @@ class LayoutLMv3Processor(ProcessorMixin):
**kwargs
) -> BatchEncoding:
"""
This method first forwards the `images` argument to [`~LayoutLMv3FeatureExtractor.__call__`]. In case
[`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
[`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
together with resized and normalized `pixel_values`. In case [`LayoutLMv3FeatureExtractor`] was initialized
with `apply_ocr` set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user
along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
`apply_ocr` set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along
with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
resized and normalized `pixel_values`.
Please refer to the docstring of the above two methods for more information.
"""
# verify input
if self.feature_extractor.apply_ocr and (boxes is not None):
if self.image_processor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the feature extractor with apply_ocr set to True."
"You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
)
if self.feature_extractor.apply_ocr and (word_labels is not None):
if self.image_processor.apply_ocr and (word_labels is not None):
raise ValueError(
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
"You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
)
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
# first, apply the image processor
features = self.image_processor(images=images, return_tensors=return_tensors)
# second, apply the tokenizer
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
if text is not None and self.image_processor.apply_ocr and text_pair is None:
if isinstance(text, str):
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
text = [text] # add batch dimension (as the image processor always adds a batch dimension)
text_pair = features["words"]
encoded_inputs = self.tokenizer(
......@@ -160,3 +179,19 @@ class LayoutLMv3Processor(ProcessorMixin):
@property
def model_input_names(self):
return ["input_ids", "bbox", "attention_mask", "pixel_values"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
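For the `apply_ocr=False` path, the caller supplies words and boxes explicitly; a hedged usage sketch assuming `transformers`, `numpy`, `Pillow` and Hub access (checkpoint `microsoft/layoutlmv3-base`):

```python
import numpy as np
from PIL import Image
from transformers import LayoutLMv3ImageProcessor, LayoutLMv3Processor, LayoutLMv3TokenizerFast

image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)  # caller supplies words/boxes
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer)

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # normalized 0-1000 coordinates

encoding = processor(image, words, boxes=boxes, return_tensors="np")
print(sorted(encoding.keys()))  # attention_mask, bbox, input_ids, pixel_values
```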
......@@ -81,10 +81,10 @@ def box_iou(boxes1, boxes2):
class OwlViTImageProcessor(BaseImageProcessor):
r"""
Constructs an OWL-ViT feature extractor.
Constructs an OWL-ViT image processor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
......@@ -115,7 +115,6 @@ class OwlViTImageProcessor(BaseImageProcessor):
image_std (`List[int]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
model_input_names = ["pixel_values"]
def __init__(
......@@ -139,7 +138,7 @@ class OwlViTImageProcessor(BaseImageProcessor):
crop_size = get_size_dict(crop_size, default_to_square=True)
# Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
# vision feature extractor method `rescale` as it would be set as an attribute during the super().__init__
# vision image processor method `rescale` as it would be set as an attribute during the super().__init__
# call. This is for backwards compatibility.
if "rescale" in kwargs:
rescale_val = kwargs.pop("rescale")
......
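The guard pops the legacy flag before `super().__init__` can set it as an attribute and shadow the method of the same name; a generic sketch of that pitfall (the `do_rescale` rename here is illustrative):

```python
class Base:
    def __init__(self, **kwargs):
        # Mimics configs whose extra keys become attributes in __init__.
        for key, value in kwargs.items():
            setattr(self, key, value)


class Proc(Base):
    def __init__(self, **kwargs):
        # A stored boolean "rescale" would shadow the rescale() method once
        # set as an attribute, so pop and rename it before calling super().
        if "rescale" in kwargs:
            kwargs["do_rescale"] = kwargs.pop("rescale")
        super().__init__(**kwargs)

    def rescale(self, value, factor):
        return value * factor


p = Proc(rescale=True)
assert p.do_rescale is True
assert p.rescale(2, 0.5) == 1.0  # the method survives
```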
......@@ -16,6 +16,7 @@
Image/Text processor class for OWL-ViT
"""
import warnings
from typing import List
import numpy as np
......@@ -28,29 +29,44 @@ from ...tokenization_utils_base import BatchEncoding
class OwlViTProcessor(ProcessorMixin):
r"""
Constructs an OWL-ViT processor which wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
into a single processor that inherits both the feature extractor and tokenizer functionalities. See the
Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
into a single processor that inherits both the image processor and tokenizer functionalities. See the
[`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
Args:
feature_extractor ([`OwlViTFeatureExtractor`]):
image_processor ([`OwlViTImageProcessor`]):
The image processor is a required input.
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
The tokenizer is a required input.
"""
feature_extractor_class = "OwlViTFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "OwlViTImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
"""
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
docstring of the above two methods for more information.
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
......@@ -137,13 +153,13 @@ class OwlViTProcessor(ProcessorMixin):
if query_images is not None:
encoding = BatchEncoding()
query_pixel_values = self.feature_extractor(
query_pixel_values = self.image_processor(
query_images, return_tensors=return_tensors, **kwargs
).pixel_values
encoding["query_pixel_values"] = query_pixel_values
if images is not None:
image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
......@@ -158,17 +174,17 @@ class OwlViTProcessor(ProcessorMixin):
def post_process(self, *args, **kwargs):
"""
This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process`]. Please refer to the
docstring of this method for more information.
This method forwards all its arguments to [`OwlViTImageProcessor.post_process`]. Please refer to the docstring
of this method for more information.
"""
return self.feature_extractor.post_process(*args, **kwargs)
return self.image_processor.post_process(*args, **kwargs)
def post_process_image_guided_detection(self, *args, **kwargs):
"""
This method forwards all its arguments to [`OwlViTFeatureExtractor.post_process_one_shot_object_detection`].
This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
Please refer to the docstring of this method for more information.
"""
return self.feature_extractor.post_process_image_guided_detection(*args, **kwargs)
return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
def batch_decode(self, *args, **kwargs):
"""
......@@ -183,3 +199,19 @@ class OwlViTProcessor(ProcessorMixin):
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
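`__call__` assembles the `BatchEncoding` from whichever of the three inputs are present; an illustrative reduction of that branching (error message paraphrased, string stand-ins for tensors):

```python
def assemble_encoding(text=None, images=None, query_images=None):
    """Illustrative reduction of the branching in OwlViTProcessor.__call__."""
    if text is None and images is None and query_images is None:
        raise ValueError("Specify at least one of `text`, `images` or `query_images`.")
    encoding = {}
    if text is not None:
        encoding["input_ids"] = f"tokenized({text!r})"
    if query_images is not None:
        encoding["query_pixel_values"] = f"processed({query_images!r})"
    if images is not None:
        encoding["pixel_values"] = f"processed({images!r})"
    return encoding


print(sorted(assemble_encoding(text="a cat", images="frame0")))
print(sorted(assemble_encoding(query_images="query0", images="frame0")))
```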
......@@ -42,7 +42,7 @@ class TrOCRProcessor(ProcessorMixin):
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
......@@ -124,8 +124,15 @@ class TrOCRProcessor(ProcessorMixin):
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
" instead.",
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
......@@ -16,6 +16,7 @@
Processor class for ViLT.
"""
import warnings
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
......@@ -25,23 +26,38 @@ from ...utils import TensorType
class ViltProcessor(ProcessorMixin):
r"""
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
Constructs a ViLT processor which wraps a BERT tokenizer and ViLT image processor into a single processor.
[`ViltProcessor`] offers all the functionalities of [`ViltFeatureExtractor`] and [`BertTokenizerFast`]. See the
docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information.
Args:
feature_extractor (`ViltFeatureExtractor`):
An instance of [`ViltFeatureExtractor`]. The feature extractor is a required input.
image_processor (`ViltFeatureExtractor`):
An instance of [`ViltFeatureExtractor`]. The image processor is a required input.
tokenizer (`BertTokenizerFast`):
An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
"""
feature_extractor_class = "ViltFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "ViltFeatureExtractor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
......@@ -88,8 +104,8 @@ class ViltProcessor(ProcessorMixin):
**kwargs,
)
# add pixel_values + pixel_mask
encoding_feature_extractor = self.feature_extractor(images, return_tensors=return_tensors)
encoding.update(encoding_feature_extractor)
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
encoding.update(encoding_image_processor)
return encoding
......@@ -110,5 +126,21 @@ class ViltProcessor(ProcessorMixin):
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
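`BatchEncoding` is dict-like, so merging the tokenizer and image processor outputs is a plain `update`; a minimal sketch with dicts standing in for the two encodings:

```python
# Dicts stand in for the two BatchEncoding-like outputs.
encoding = {"input_ids": [[101, 102]], "attention_mask": [[1, 1]]}
encoding_image_processor = {"pixel_values": ["<tensor>"], "pixel_mask": ["<mask>"]}

# add pixel_values + pixel_mask, mirroring what ViltProcessor.__call__ does
encoding.update(encoding_image_processor)
print(sorted(encoding))  # ['attention_mask', 'input_ids', 'pixel_mask', 'pixel_values']
```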
......@@ -44,7 +44,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v4.27, use `image_processor`"
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
......@@ -132,10 +132,18 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v4.27. Use `image_processor_class`"
" instead.",
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
......@@ -15,38 +15,55 @@
"""
Image/Text processor class for XCLIP
"""
import warnings
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
class XCLIPProcessor(ProcessorMixin):
r"""
Constructs an X-CLIP processor which wraps a VideoMAE feature extractor and a CLIP tokenizer into a single
processor.
Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor.
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEFeatureExtractor`] and [`CLIPTokenizerFast`]. See
the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
[`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the
[`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information.
Args:
feature_extractor ([`VideoMAEFeatureExtractor`]):
The feature extractor is a required input.
image_processor ([`VideoMAEImageProcessor`]):
The image processor is a required input.
tokenizer ([`CLIPTokenizerFast`]):
The tokenizer is a required input.
"""
feature_extractor_class = "VideoMAEFeatureExtractor"
attributes = ["image_processor", "tokenizer"]
image_processor_class = "VideoMAEImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
VideoMAEFeatureExtractor's [`~VideoMAEFeatureExtractor.__call__`] if `videos` is not `None`. Please refer to
the docstring of the above two methods for more information.
VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
docstring of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
......@@ -84,7 +101,7 @@ class XCLIPProcessor(ProcessorMixin):
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
if videos is not None:
image_features = self.feature_extractor(videos, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
if text is not None and videos is not None:
encoding["pixel_values"] = image_features.pixel_values
......@@ -111,3 +128,19 @@ class XCLIPProcessor(ProcessorMixin):
@property
def model_input_names(self):
return ["input_ids", "attention_mask", "position_ids", "pixel_values"]
@property
def feature_extractor_class(self):
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
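For X-CLIP the vision input is a list of video frames rather than a single image; a hedged usage sketch assuming `transformers`, `numpy` and Hub access (checkpoint `microsoft/xclip-base-patch32`):

```python
import numpy as np
from transformers import XCLIPProcessor

processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")

# Eight dummy RGB frames standing in for one video clip.
video = list(np.zeros((8, 224, 224, 3), dtype=np.uint8))

inputs = processor(text=["playing sports"], videos=video, return_tensors="np")
print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'pixel_values']
```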
......@@ -158,7 +158,8 @@ class ProcessorMixin(PushToHubMixin):
<Tip>
This class method is simply calling the feature extractor
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
[`~image_processing_utils.ImageProcessingMixin.from_pretrained`] and the tokenizer
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
methods above for more information.
......
......@@ -30,7 +30,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
if is_vision_available():
from PIL import Image
from transformers import ChineseCLIPFeatureExtractor, ChineseCLIPProcessor
from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor
@require_vision
......@@ -62,7 +62,7 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
feature_extractor_map = {
image_processor_map = {
"do_resize": True,
"size": {"height": 224, "width": 224},
"do_center_crop": True,
......@@ -72,9 +72,9 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
"image_std": [0.26862954, 0.26130258, 0.27577711],
"do_convert_rgb": True,
}
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp)
self.image_processor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
......@@ -82,8 +82,8 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
return ChineseCLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return ChineseCLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
......@@ -102,13 +102,13 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname)
processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname)
processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname)
......@@ -118,19 +118,17 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, ChineseCLIPFeatureExtractor)
self.assertIsInstance(processor_fast.feature_extractor, ChineseCLIPFeatureExtractor)
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor)
self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor)
def test_save_load_pretrained_additional_features(self):
processor = ChineseCLIPProcessor(
tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()
)
processor = ChineseCLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False)
image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
processor = ChineseCLIPProcessor.from_pretrained(
self.tmpdirname, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False
......@@ -139,28 +137,28 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, ChineseCLIPFeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np")
input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "Alexandra,T-shirt的价格是15便士。"
......@@ -172,10 +170,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "Alexandra,T-shirt的价格是15便士。"
image_input = self.prepare_image_inputs()
......@@ -189,10 +187,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
processor()
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
......@@ -202,10 +200,10 @@ class ChineseCLIPProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = ChineseCLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "Alexandra,T-shirt的价格是15便士。"
image_input = self.prepare_image_inputs()
......
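These tests lean on a `prepare_image_inputs` helper that is referenced but not shown in this excerpt; it conventionally builds a few random PIL images, roughly like this sketch (not the exact source):

```python
import numpy as np
from PIL import Image


def prepare_image_inputs(batch_size=3, height=30, width=400):
    """Build a small list of random PIL images, as the processor tests expect."""
    return [
        Image.fromarray(np.random.randint(255, size=(height, width, 3), dtype=np.uint8))
        for _ in range(batch_size)
    ]


images = prepare_image_inputs()
print(len(images), images[0].size)  # 3 (400, 30)
```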
......@@ -24,13 +24,13 @@ import pytest
from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
if is_vision_available():
from PIL import Image
from transformers import CLIPFeatureExtractor, CLIPProcessor
from transformers import CLIPImageProcessor, CLIPProcessor
@require_vision
......@@ -52,7 +52,7 @@ class CLIPProcessorTest(unittest.TestCase):
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
feature_extractor_map = {
image_processor_map = {
"do_resize": True,
"size": 20,
"do_center_crop": True,
......@@ -61,9 +61,9 @@ class CLIPProcessorTest(unittest.TestCase):
"image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711],
}
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp)
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs):
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
......@@ -71,8 +71,8 @@ class CLIPProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs):
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
return CLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return CLIPImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
......@@ -91,13 +91,13 @@ class CLIPProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname)
processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname)
processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname)
......@@ -107,17 +107,17 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, CLIPFeatureExtractor)
self.assertIsInstance(processor_fast.feature_extractor, CLIPFeatureExtractor)
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor)
self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor)
def test_save_load_pretrained_additional_features(self):
processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = CLIPProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
......@@ -126,28 +126,28 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, CLIPImageProcessor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np")
input_image_proc = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
for key in input_image_proc.keys():
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
......@@ -159,10 +159,10 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......@@ -176,10 +176,10 @@ class CLIPProcessorTest(unittest.TestCase):
processor()
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
......@@ -189,10 +189,10 @@ class CLIPProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......
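The test_image_processor change above is the behavioral core of the rename: the processor should delegate image handling to the image processor unchanged. A minimal standalone sketch of that equivalence (checkpoint name illustrative; the random image stands in for prepare_image_inputs):

import numpy as np
from PIL import Image

from transformers import CLIPImageProcessor, CLIPProcessor, CLIPTokenizerFast

ckpt = "openai/clip-vit-base-patch32"  # illustrative public checkpoint
image_processor = CLIPImageProcessor.from_pretrained(ckpt)
tokenizer = CLIPTokenizerFast.from_pretrained(ckpt)
processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)

image = Image.fromarray(np.random.randint(0, 255, (30, 400, 3), dtype=np.uint8))

direct = image_processor(image, return_tensors="np")
via_processor = processor(text="lower newer", images=image, return_tensors="np")

# Same preprocessing on both paths, so the pixel values match.
assert np.allclose(direct["pixel_values"], via_processor["pixel_values"])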
......@@ -24,13 +24,13 @@ import pytest
from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
if is_vision_available():
from PIL import Image
from transformers import CLIPSegProcessor, ViTFeatureExtractor
from transformers import CLIPSegProcessor, ViTImageProcessor
@require_vision
......@@ -52,7 +52,7 @@ class CLIPSegProcessorTest(unittest.TestCase):
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
feature_extractor_map = {
image_processor_map = {
"do_resize": True,
"size": 20,
"do_center_crop": True,
......@@ -61,9 +61,9 @@ class CLIPSegProcessorTest(unittest.TestCase):
"image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711],
}
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp)
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs):
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
......@@ -71,8 +71,8 @@ class CLIPSegProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs):
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
return ViTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
......@@ -90,13 +90,13 @@ class CLIPSegProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname)
processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname)
processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname)
......@@ -106,17 +106,17 @@ class CLIPSegProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, ViTFeatureExtractor)
self.assertIsInstance(processor_fast.feature_extractor, ViTFeatureExtractor)
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor)
self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor)
def test_save_load_pretrained_additional_features(self):
processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = CLIPSegProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
......@@ -125,28 +125,28 @@ class CLIPSegProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, ViTFeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, ViTImageProcessor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np")
input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
......@@ -158,10 +158,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......@@ -175,10 +175,10 @@ class CLIPSegProcessorTest(unittest.TestCase):
processor()
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = CLIPSegProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
......
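The save/load tests above also cover the persistence side of the rename; reproducing the round trip outside the suite looks roughly like this (a sketch, assuming default ViTImageProcessor settings and an illustrative CLIP tokenizer checkpoint):

import tempfile

from transformers import CLIPSegProcessor, CLIPTokenizerFast, ViTImageProcessor

processor = CLIPSegProcessor(
    tokenizer=CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32"),
    image_processor=ViTImageProcessor(),
)

with tempfile.TemporaryDirectory() as tmpdir:
    processor.save_pretrained(tmpdir)
    reloaded = CLIPSegProcessor.from_pretrained(tmpdir)
    # The component survives the round trip under the new attribute name.
    assert isinstance(reloaded.image_processor, ViTImageProcessor)
    assert reloaded.image_processor.to_json_string() == processor.image_processor.to_json_string()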
......@@ -25,13 +25,13 @@ import pytest
from transformers import BertTokenizer, BertTokenizerFast
from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
if is_vision_available():
from PIL import Image
from transformers import FlavaFeatureExtractor, FlavaProcessor
from transformers import FlavaImageProcessor, FlavaProcessor
from transformers.models.flava.image_processing_flava import (
FLAVA_CODEBOOK_MEAN,
FLAVA_CODEBOOK_STD,
......@@ -53,7 +53,7 @@ class FlavaProcessorTest(unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write("".join([x + "\n" for x in vocab_tokens]))
feature_extractor_map = {
image_processor_map = {
"image_mean": FLAVA_IMAGE_MEAN,
"image_std": FLAVA_IMAGE_STD,
"do_normalize": True,
......@@ -77,9 +77,9 @@ class FlavaProcessorTest(unittest.TestCase):
"codebook_image_std": FLAVA_CODEBOOK_STD,
}
self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extractor_file, "w", encoding="utf-8") as fp:
json.dump(feature_extractor_map, fp)
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
......@@ -87,8 +87,8 @@ class FlavaProcessorTest(unittest.TestCase):
def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
return FlavaFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
......@@ -107,13 +107,13 @@ class FlavaProcessorTest(unittest.TestCase):
def test_save_load_pretrained_default(self):
tokenizer_slow = self.get_tokenizer()
tokenizer_fast = self.get_rust_tokenizer()
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, feature_extractor=feature_extractor)
processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
processor_slow.save_pretrained(self.tmpdirname)
processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False)
processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor)
processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
processor_fast.save_pretrained(self.tmpdirname)
processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname)
......@@ -123,17 +123,17 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor_slow.feature_extractor, FlavaFeatureExtractor)
self.assertIsInstance(processor_fast.feature_extractor, FlavaFeatureExtractor)
self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor)
self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor)
def test_save_load_pretrained_additional_features(self):
processor = FlavaProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = FlavaProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
......@@ -142,18 +142,18 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, FlavaFeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, FlavaImageProcessor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_feat_extract = feature_extractor(image_input, return_tensors="np")
input_feat_extract = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
for key in input_feat_extract.keys():
......@@ -161,7 +161,7 @@ class FlavaProcessorTest(unittest.TestCase):
# With rest of the args
random.seed(1234)
input_feat_extract = feature_extractor(
input_feat_extract = image_processor(
image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np"
)
random.seed(1234)
......@@ -173,10 +173,10 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
......@@ -188,10 +188,10 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......@@ -220,10 +220,10 @@ class FlavaProcessorTest(unittest.TestCase):
processor()
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
......@@ -233,10 +233,10 @@ class FlavaProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = FlavaProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......
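FLAVA is the one processor here with extra image-side flags; as the seeded comparison above checks, return_image_mask and return_codebook_pixels are forwarded to the image processor unchanged. A minimal sketch of those flags (checkpoint name illustrative):

import numpy as np
from PIL import Image

from transformers import FlavaProcessor

processor = FlavaProcessor.from_pretrained("facebook/flava-full")  # illustrative public checkpoint

image = Image.fromarray(np.random.randint(0, 255, (30, 400, 3), dtype=np.uint8))

outputs = processor(
    text="lower newer",
    images=image,
    return_image_mask=True,       # requests the masked-image-modeling mask
    return_codebook_pixels=True,  # requests inputs for the visual codebook
    return_tensors="np",
)
# Expect extra keys such as bool_masked_pos and codebook_pixel_values
# alongside input_ids, attention_mask and pixel_values.
print(sorted(outputs.keys()))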
......@@ -31,7 +31,7 @@ from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytes
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor
@require_pytesseract
......@@ -59,7 +59,7 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
"lowest",
]
feature_extractor_map = {
image_processor_map = {
"do_resize": True,
"size": 224,
"apply_ocr": True,
......@@ -69,9 +69,9 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n")
self.image_processing_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.image_processing_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(image_processor_map) + "\n")
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
......@@ -82,8 +82,8 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
def get_feature_extractor(self, **kwargs):
return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return LayoutLMv2ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
......@@ -100,10 +100,10 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
return image_inputs
def test_save_load_pretrained_default(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)
processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)
......@@ -111,16 +111,16 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
def test_save_load_pretrained_additional_features(self):
processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
processor = LayoutLMv2Processor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer())
processor.save_pretrained(self.tmpdirname)
# slow tokenizer
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
processor = LayoutLMv2Processor.from_pretrained(
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
......@@ -129,12 +129,12 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
# fast tokenizer
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30)
processor = LayoutLMv2Processor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
......@@ -143,14 +143,14 @@ class LayoutLMv2ProcessorTest(unittest.TestCase):
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = LayoutLMv2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = LayoutLMv2Processor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......@@ -220,15 +220,15 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
def test_processor_case_1(self):
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
feature_extractor = LayoutLMv2FeatureExtractor()
image_processor = LayoutLMv2ImageProcessor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
input_image_proc = image_processor(images[0], return_tensors="pt")
input_processor = processor(images[0], return_tensors="pt")
# verify keys
......@@ -237,9 +237,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
self.assertListEqual(actual_keys, expected_keys)
# verify image
self.assertAlmostEqual(
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
)
self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)
# verify input_ids
# this was obtained with Tesseract 4.1.1
......@@ -250,7 +248,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
self.assertSequenceEqual(decoding, expected_decoding)
# batched
input_feat_extract = feature_extractor(images, return_tensors="pt")
input_image_proc = image_processor(images, return_tensors="pt")
input_processor = processor(images, padding=True, return_tensors="pt")
# verify keys
......@@ -259,9 +257,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
self.assertListEqual(actual_keys, expected_keys)
# verify images
self.assertAlmostEqual(
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
)
self.assertAlmostEqual(input_image_proc["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2)
# verify input_ids
# this was obtained with Tesseract 4.1.1
......@@ -275,12 +271,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
def test_processor_case_2(self):
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
words = ["hello", "world"]
......@@ -329,12 +325,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
def test_processor_case_3(self):
# case 3: token classification (training), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
words = ["weirdly", "world"]
......@@ -394,12 +390,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
def test_processor_case_4(self):
# case 4: visual question answering (inference), apply_ocr=True
feature_extractor = LayoutLMv2FeatureExtractor()
image_processor = LayoutLMv2ImageProcessor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
......@@ -445,12 +441,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
def test_processor_case_5(self):
# case 5: visual question answering (inference), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
......
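For the apply_ocr=False cases above, the caller supplies words and normalized boxes instead of letting Tesseract run; a minimal sketch of that flow with the renamed kwarg (toy words and boxes, illustrative tokenizer checkpoint):

import numpy as np
from PIL import Image

from transformers import LayoutLMv2ImageProcessor, LayoutLMv2Processor, LayoutLMv2TokenizerFast

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
processor = LayoutLMv2Processor(image_processor=image_processor, tokenizer=tokenizer)

image = Image.fromarray(np.random.randint(0, 255, (30, 400, 3), dtype=np.uint8))
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # boxes use the 0-1000 normalized coordinate space

encoding = processor(image, words, boxes=boxes, return_tensors="pt")
# Keys include "image", "input_ids", "bbox", "attention_mask", ...
print(sorted(encoding.keys()))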