Unverified Commit ae454f41 authored by amyeroberts, committed by GitHub

Update old existing feature extractor references (#24552)

* Update old existing feature extractor references

* Typo

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Address comments from review - update 'feature extractor'
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
parent 10c2ac7b
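The pattern applied throughout the diff below is mechanical: every vision `*FeatureExtractor` reference is replaced by its `*ImageProcessor` equivalent, which takes the same constructor arguments and returns the same `pixel_values`. As a minimal sketch of the new-style usage (the checkpoint name is only an illustrative example, assuming a transformers version in which the image processor classes exist):

import requests
from PIL import Image

from transformers import ViTImageProcessor  # replaces the deprecated ViTFeatureExtractor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Same call signature and output keys as the old feature extractor
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])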
@@ -8,7 +8,7 @@ from PIL import Image
 from timm.models import create_model

 from transformers import (
-    BeitFeatureExtractor,
+    BeitImageProcessor,
     Data2VecVisionConfig,
     Data2VecVisionForImageClassification,
     Data2VecVisionModel,
@@ -304,9 +304,9 @@ def main():
     orig_model.eval()

     # 3. Forward Beit model
-    feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
+    image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
     image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
-    encoding = feature_extractor(images=image, return_tensors="pt")
+    encoding = image_processor(images=image, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
@@ -354,7 +354,7 @@ def main():
     # 7. Save
     print(f"Saving to {args.hf_checkpoint_name}")
     hf_model.save_pretrained(args.hf_checkpoint_name)
-    feature_extractor.save_pretrained(args.hf_checkpoint_name)
+    image_processor.save_pretrained(args.hf_checkpoint_name)


 if __name__ == "__main__":
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DeformableDetrConfig, DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection
+from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor
 from transformers.utils import logging

@@ -115,12 +115,12 @@ def convert_deformable_detr_checkpoint(
     config.id2label = id2label
     config.label2id = {v: k for k, v in id2label.items()}

-    # load feature extractor
-    feature_extractor = DeformableDetrFeatureExtractor(format="coco_detection")
+    # load image processor
+    image_processor = DeformableDetrImageProcessor(format="coco_detection")

     # prepare image
     img = prepare_img()
-    encoding = feature_extractor(images=img, return_tensors="pt")
+    encoding = image_processor(images=img, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     logger.info("Converting model...")
@@ -185,11 +185,11 @@ def convert_deformable_detr_checkpoint(
     print("Everything ok!")

-    # Save model and feature extractor
-    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    # Save model and image processor
+    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_folder_path)
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     # Push to hub
     if push_to_hub:
@@ -25,7 +25,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import DeiTConfig, DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
+from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor
 from transformers.utils import logging

@@ -182,12 +182,12 @@ def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
     model = DeiTForImageClassificationWithTeacher(config).eval()
     model.load_state_dict(state_dict)

-    # Check outputs on an image, prepared by DeiTFeatureExtractor
+    # Check outputs on an image, prepared by DeiTImageProcessor
     size = int(
         (256 / 224) * config.image_size
     )  # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103
-    feature_extractor = DeiTFeatureExtractor(size=size, crop_size=config.image_size)
-    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
+    image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size)
+    encoding = image_processor(images=prepare_img(), return_tensors="pt")
     pixel_values = encoding["pixel_values"]
     outputs = model(pixel_values)
@@ -198,8 +198,8 @@ def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)


 if __name__ == "__main__":
@@ -25,7 +25,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation
+from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor
 from transformers.utils import logging

@@ -201,13 +201,13 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     config.id2label = id2label
     config.label2id = {v: k for k, v in id2label.items()}

-    # load feature extractor
+    # load image processor
     format = "coco_panoptic" if is_panoptic else "coco_detection"
-    feature_extractor = DetrFeatureExtractor(format=format)
+    image_processor = DetrImageProcessor(format=format)

     # prepare image
     img = prepare_img()
-    encoding = feature_extractor(images=img, return_tensors="pt")
+    encoding = image_processor(images=img, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     logger.info(f"Converting model {model_name}...")
@@ -258,11 +258,11 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     if is_panoptic:
         assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)

-    # Save model and feature extractor
-    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    # Save model and image processor
+    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_folder_path)
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    image_processor.save_pretrained(pytorch_dump_folder_path)


 if __name__ == "__main__":
@@ -1341,8 +1341,7 @@ class DetrImageProcessor(BaseImageProcessor):
         Args:
             results (`List[Dict]`):
-                Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks" results will be
-                added.
+                Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
             outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
             orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
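The corrected docstring reference above points at [`~DetrImageProcessor.post_process`]. As a hedged usage sketch of the DETR post-processing path (assuming the public facebook/detr-resnet-50 checkpoint; `post_process_object_detection` is the non-deprecated counterpart of `post_process`):

import requests
import torch
from PIL import Image

from transformers import DetrForObjectDetection, DetrImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = image_processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# target_sizes holds the original (height, width) so boxes are rescaled back to image space
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
print(results["scores"], results["labels"], results["boxes"])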
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import BeitConfig, BeitFeatureExtractor, BeitForImageClassification, BeitForMaskedImageModeling
+from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor
 from transformers.image_utils import PILImageResampling
 from transformers.utils import logging

@@ -171,12 +171,12 @@ def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     model.load_state_dict(state_dict)

     # Check outputs on an image
-    feature_extractor = BeitFeatureExtractor(
+    image_processor = BeitImageProcessor(
         size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False
     )
     image = prepare_img()
-    encoding = feature_extractor(images=image, return_tensors="pt")
+    encoding = image_processor(images=image, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     outputs = model(pixel_values)
@@ -189,18 +189,18 @@ def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         if has_lm_head:
             model_name = "dit-base" if "base" in checkpoint_url else "dit-large"
         else:
             model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip"

-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
         model.push_to_hub(
@@ -21,7 +21,7 @@ from datasets import load_dataset
 from donut import DonutModel

 from transformers import (
-    DonutFeatureExtractor,
+    DonutImageProcessor,
     DonutProcessor,
     DonutSwinConfig,
     DonutSwinModel,
@@ -152,10 +152,10 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
     image = dataset["test"][0]["image"].convert("RGB")

     tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
-    feature_extractor = DonutFeatureExtractor(
+    image_processor = DonutImageProcessor(
         do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
     )
-    processor = DonutProcessor(feature_extractor, tokenizer)
+    processor = DonutProcessor(image_processor, tokenizer)
     pixel_values = processor(image, return_tensors="pt").pixel_values

     if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation
+from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
 from transformers.utils import logging

@@ -244,10 +244,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # Check outputs on an image
     size = 480 if "ade" in checkpoint_url else 384
-    feature_extractor = DPTFeatureExtractor(size=size)
+    image_processor = DPTImageProcessor(size=size)
     image = prepare_img()
-    encoding = feature_extractor(image, return_tensors="pt")
+    encoding = image_processor(image, return_tensors="pt")

     # forward pass
     outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
@@ -271,12 +271,12 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         model.push_to_hub("ybelkada/dpt-hybrid-midas")
-        feature_extractor.push_to_hub("ybelkada/dpt-hybrid-midas")
+        image_processor.push_to_hub("ybelkada/dpt-hybrid-midas")


 if __name__ == "__main__":
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation
+from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
 from transformers.utils import logging

@@ -211,10 +211,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # Check outputs on an image
     size = 480 if "ade" in checkpoint_url else 384
-    feature_extractor = DPTFeatureExtractor(size=size)
+    image_processor = DPTImageProcessor(size=size)
     image = prepare_img()
-    encoding = feature_extractor(image, return_tensors="pt")
+    encoding = image_processor(image, return_tensors="pt")

     # forward pass
     outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
@@ -233,8 +233,8 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         print("Pushing model to hub...")
@@ -244,10 +244,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
             commit_message="Add model",
             use_temp_dir=True,
         )
-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -208,7 +208,7 @@ def convert_efficientformer_checkpoint(
         )
         processor.push_to_hub(
             repo_id=f"Bearnardd/{pytorch_dump_path}",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -234,12 +234,12 @@ if __name__ == "__main__":
         "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
     )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
     parser.add_argument(
         "--no-push_to_hub",
         dest="push_to_hub",
         action="store_false",
-        help="Do not push model and feature extractor to the hub",
+        help="Do not push model and image processor to the hub",
     )
     parser.set_defaults(push_to_hub=True)
@@ -537,8 +537,8 @@ EFFICIENTFORMER_START_DOCSTRING = r"""
 EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
-            [`ViTFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
+            [`ViTImageProcessor.preprocess`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -305,12 +305,12 @@ def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_m
         # Create folder to save model
         if not os.path.isdir(pytorch_dump_folder_path):
             os.mkdir(pytorch_dump_folder_path)
-        # Save converted model and feature extractor
+        # Save converted model and image processor
         hf_model.save_pretrained(pytorch_dump_folder_path)
         preprocessor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
-        # Push model and feature extractor to hub
+        # Push model and image processor to hub
         print(f"Pushing converted {model_name} to the hub...")
         model_name = f"efficientnet-{model_name}"
         preprocessor.push_to_hub(model_name)
@@ -333,7 +333,7 @@ if __name__ == "__main__":
         help="Path to the output PyTorch model directory.",
     )
     parser.add_argument("--save_model", action="store_true", help="Save model to local")
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")

     args = parser.parse_args()
     convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)
@@ -23,7 +23,7 @@ import requests
 import torch
 from PIL import Image

-from transformers import GLPNConfig, GLPNFeatureExtractor, GLPNForDepthEstimation
+from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor
 from transformers.utils import logging

@@ -131,12 +131,12 @@ def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_h
     # load GLPN configuration (Segformer-B4 size)
     config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3])

-    # load feature extractor (only resize + rescale)
-    feature_extractor = GLPNFeatureExtractor()
+    # load image processor (only resize + rescale)
+    image_processor = GLPNImageProcessor()

     # prepare image
     image = prepare_img()
-    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

     logger.info("Converting model...")
@@ -179,17 +179,17 @@ def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_h
     # finally, push to hub if required
     if push_to_hub:
-        logger.info("Pushing model and feature extractor to the hub...")
+        logger.info("Pushing model and image processor to the hub...")

         model.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
             commit_message="Add model",
             use_temp_dir=True,
         )
-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -458,7 +458,7 @@ class GroupViTOnnxConfig(OnnxConfig):
             processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
         )
         image_input_dict = super().generate_dummy_inputs(
-            processor.feature_extractor, batch_size=batch_size, framework=framework
+            processor.image_processor, batch_size=batch_size, framework=framework
         )
         return {**text_input_dict, **image_input_dict}
@@ -81,7 +81,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
     def __init__(
         self,
-        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTFeatureExtractor
+        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor
         clusters: Optional[Union[List[List[int]], np.ndarray]] = None,
         do_resize: bool = True,
         size: Dict[str, int] = None,
@@ -260,7 +260,7 @@ class LayoutLMv3OnnxConfig(OnnxConfig):
         """

         # A dummy image is used so OCR should not be applied
-        setattr(processor.feature_extractor, "apply_ocr", False)
+        setattr(processor.image_processor, "apply_ocr", False)

         # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
         batch_size = compute_effective_axis_dimension(
@@ -15,6 +15,7 @@
 """
 Processor class for LayoutXLM.
 """
+import warnings
 from typing import List, Optional, Union

 from ...processing_utils import ProcessorMixin
@@ -24,26 +25,45 @@ from ...utils import TensorType

 class LayoutXLMProcessor(ProcessorMixin):
     r"""
-    Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
-    single processor.
+    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
+    processor.

     [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.

-    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
-    to get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
     [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
     `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
     into token-level `labels` for token classification tasks (such as FUNSD, CORD).

     Args:
-        feature_extractor (`LayoutLMv2FeatureExtractor`):
-            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
+        image_processor (`LayoutLMv2ImageProcessor`):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
         tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
             An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
     """

-    feature_extractor_class = "LayoutLMv2FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
     tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")

+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
     def __call__(
         self,
         images,
def __call__(
self,
images,
@@ -68,37 +88,37 @@ class LayoutXLMProcessor(ProcessorMixin):
         **kwargs,
     ) -> BatchEncoding:
         """
-        This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
-        [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
         bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
-        together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
         `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
         arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images`.

         Please refer to the docstring of the above two methods for more information.
         """
         # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
             raise ValueError(
                 "You cannot provide bounding boxes "
-                "if you initialized the feature extractor with apply_ocr set to True."
+                "if you initialized the image processor with apply_ocr set to True."
             )

-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
             raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
             )

         if return_overflowing_tokens is True and return_offsets_mapping is False:
             raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")

-        # first, apply the feature extractor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)

         # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
             if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
             text_pair = features["words"]

         encoded_inputs = self.tokenizer(
@@ -162,3 +182,19 @@ class LayoutXLMProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         return ["input_ids", "bbox", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
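A small sketch of the backward-compatibility path added above (the tokenizer checkpoint name is only an example): constructing the processor with the old `feature_extractor` keyword still works, but now routes through the deprecation shim and emits a `FutureWarning`:

import warnings

from transformers import LayoutLMv2ImageProcessor, LayoutXLMProcessor, LayoutXLMTokenizerFast

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old keyword still accepted, but routed through the deprecation shim
    processor = LayoutXLMProcessor(feature_extractor=image_processor, tokenizer=tokenizer)

assert any(issubclass(w.category, FutureWarning) for w in caught)
assert processor.image_processor is image_processor  # stored under the new attribute name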
@@ -25,7 +25,7 @@ import timm
 import torch
 from huggingface_hub import hf_hub_download

-from transformers import LevitConfig, LevitFeatureExtractor, LevitForImageClassificationWithTeacher
+from transformers import LevitConfig, LevitForImageClassificationWithTeacher, LevitImageProcessor
 from transformers.utils import logging

@@ -74,8 +74,8 @@ def convert_weight_and_push(
     if push_to_hub:
         our_model.save_pretrained(save_directory / checkpoint_name)
-        feature_extractor = LevitFeatureExtractor()
-        feature_extractor.save_pretrained(save_directory / checkpoint_name)
+        image_processor = LevitImageProcessor()
+        image_processor.save_pretrained(save_directory / checkpoint_name)
         print(f"Pushed {checkpoint_name}")
@@ -167,12 +167,12 @@ if __name__ == "__main__":
         required=False,
         help="Path to the output PyTorch model directory.",
     )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
     parser.add_argument(
         "--no-push_to_hub",
         dest="push_to_hub",
         action="store_false",
-        help="Do not push model and feature extractor to the hub",
+        help="Do not push model and image processor to the hub",
    )

     args = parser.parse_args()
@@ -192,7 +192,7 @@ class OriginalMask2FormerConfigToOursConverter:
         return config


-class OriginalMask2FormerConfigToFeatureExtractorConverter:
+class OriginalMask2FormerConfigToImageProcessorConverter:
     def __call__(self, original_config: object) -> Mask2FormerImageProcessor:
         model = original_config.MODEL
         model_input = original_config.INPUT
@@ -846,7 +846,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
 def test(
     original_model,
     our_model: Mask2FormerForUniversalSegmentation,
-    feature_extractor: Mask2FormerImageProcessor,
+    image_processor: Mask2FormerImageProcessor,
     tolerance: float,
 ):
     with torch.no_grad():
@@ -854,7 +854,7 @@ def test(
         our_model = our_model.eval()

         im = prepare_img()
-        x = feature_extractor(images=im, return_tensors="pt")["pixel_values"]
+        x = image_processor(images=im, return_tensors="pt")["pixel_values"]

         original_model_backbone_features = original_model.backbone(x.clone())
         our_model_output: Mask2FormerModelOutput = our_model.model(x.clone(), output_hidden_states=True)
@@ -979,10 +979,10 @@ if __name__ == "__main__":
         checkpoints_dir, config_dir
     ):
         model_name = get_model_name(checkpoint_file)
-        feature_extractor = OriginalMask2FormerConfigToFeatureExtractorConverter()(
+        image_processor = OriginalMask2FormerConfigToImageProcessorConverter()(
             setup_cfg(Args(config_file=config_file))
         )
-        feature_extractor.size = {"height": 384, "width": 384}
+        image_processor.size = {"height": 384, "width": 384}

         original_config = setup_cfg(Args(config_file=config_file))
         mask2former_kwargs = OriginalMask2Former.from_config(original_config)
@@ -1012,8 +1012,8 @@ if __name__ == "__main__":
             tolerance = 3e-1

         logger.info(f"🪄 Testing {model_name}...")
-        test(original_model, mask2former_for_segmentation, feature_extractor, tolerance)
+        test(original_model, mask2former_for_segmentation, image_processor, tolerance)

         logger.info(f"🪄 Pushing {model_name} to hub...")
-        feature_extractor.push_to_hub(model_name)
+        image_processor.push_to_hub(model_name)
         mask2former_for_segmentation.push_to_hub(model_name)
@@ -2106,8 +2106,8 @@ MASK2FORMER_START_DOCSTRING = r"""
 MASK2FORMER_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.preprocess`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: