"...lm-evaluation-harness.git" did not exist on "10103d5de3177b6a59917c73916a47a793fcb28b"
Unverified Commit ae454f41 authored by amyeroberts, committed by GitHub

Update old existing feature extractor references (#24552)

* Update old existing feature extractor references

* Typo

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Address comments from review - update 'feature extractor'
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
parent 10c2ac7b
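For context, every hunk below applies the same pattern: a deprecated `*FeatureExtractor` class or `feature_extractor` variable is replaced by its `*ImageProcessor` counterpart, while the call sites keep the same signature. A minimal sketch of the new spelling (the checkpoint name and test image are illustrative, not taken from this diff):

```python
import requests
from PIL import Image

from transformers import ViTImageProcessor  # previously: ViTFeatureExtractor

# Illustrative checkpoint; any ViT-style checkpoint with an image processor works the same way.
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Same call signature as the old feature extractor.
inputs = image_processor(images=image, return_tensors="pt")
```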
@@ -513,8 +513,8 @@ TIMESFORMER_START_DOCSTRING = r"""
TIMESFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
[`VideoMAEFeatureExtractor.__call__`] for details.
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`VideoMAEImageProcessor.preprocess`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
......
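The updated docstring points callers at `AutoImageProcessor` / `VideoMAEImageProcessor.preprocess`. A hedged sketch of producing `pixel_values` in the documented shape, using a dummy clip (frame count and resolution are illustrative):

```python
import numpy as np
from transformers import VideoMAEImageProcessor  # per the docstring, the processor class used for TimeSformer inputs

image_processor = VideoMAEImageProcessor()
video = [np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8) for _ in range(8)]  # 8 dummy RGB frames
inputs = image_processor(video, return_tensors="pt")
print(inputs["pixel_values"].shape)  # (batch_size, num_frames, num_channels, height, width)
```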
@@ -29,7 +29,7 @@ from transformers import (
TrOCRProcessor,
VisionEncoderDecoderModel,
ViTConfig,
ViTFeatureExtractor,
ViTImageProcessor,
ViTModel,
)
from transformers.utils import logging
@@ -182,9 +182,9 @@ def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
model.load_state_dict(state_dict)
# Check outputs on an image
feature_extractor = ViTFeatureExtractor(size=encoder_config.image_size)
image_processor = ViTImageProcessor(size=encoder_config.image_size)
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
processor = TrOCRProcessor(feature_extractor, tokenizer)
processor = TrOCRProcessor(image_processor, tokenizer)
pixel_values = processor(images=prepare_img(checkpoint_url), return_tensors="pt").pixel_values
......
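A hedged sketch of how the converted TrOCR processor is assembled and used after this change, mirroring the script above (the `size` value and the input image are illustrative):

```python
from PIL import Image
from transformers import RobertaTokenizer, TrOCRProcessor, ViTImageProcessor

image_processor = ViTImageProcessor(size=384)  # the script uses encoder_config.image_size; 384 is illustrative
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
processor = TrOCRProcessor(image_processor, tokenizer)

image = Image.open("text_line.png").convert("RGB")  # illustrative handwriting / printed-text crop
pixel_values = processor(images=image, return_tensors="pt").pixel_values
```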
@@ -30,7 +30,7 @@ import torch.nn as nn
from huggingface_hub import cached_download, hf_hub_download
from torch import Tensor
from transformers import AutoFeatureExtractor, VanConfig, VanForImageClassification
from transformers import AutoImageProcessor, VanConfig, VanForImageClassification
from transformers.models.van.modeling_van import VanLayerScaling
from transformers.utils import logging
@@ -154,10 +154,10 @@ def convert_weight_and_push(
)
# we can use the convnext one
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k")
feature_extractor.push_to_hub(
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
image_processor.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add feature extractor",
commit_message="Add image processor",
use_temp_dir=True,
)
@@ -277,7 +277,7 @@ if __name__ == "__main__":
default=True,
type=bool,
required=False,
help="If True, push model and feature extractor to the hub.",
help="If True, push model and image processor to the hub.",
)
args = parser.parse_args()
......
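A hedged sketch of the hub upload the script performs with the renamed object; the ConvNeXt checkpoint comes from the script, while the target repository name is illustrative:

```python
from transformers import AutoImageProcessor

# The ConvNeXt preprocessing is reused for VAN, as the script notes.
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")

# Push the processor config to a (hypothetical) target repo on the Hub.
image_processor.push_to_hub("my-namespace/van-base", commit_message="Add image processor")
```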
@@ -24,9 +24,9 @@ from huggingface_hub import hf_hub_download
from transformers import (
VideoMAEConfig,
VideoMAEFeatureExtractor,
VideoMAEForPreTraining,
VideoMAEForVideoClassification,
VideoMAEImageProcessor,
)
@@ -198,9 +198,9 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
model.eval()
# verify model on basic input
feature_extractor = VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
image_processor = VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
video = prepare_video()
inputs = feature_extractor(video, return_tensors="pt")
inputs = image_processor(video, return_tensors="pt")
if "finetuned" not in model_name:
local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
@@ -288,8 +288,8 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
print("Loss ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model and feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
print(f"Saving model and image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
......
@@ -27,11 +27,11 @@ from PIL import Image
from transformers import (
BertTokenizer,
ViltConfig,
ViltFeatureExtractor,
ViltForImageAndTextRetrieval,
ViltForImagesAndTextClassification,
ViltForMaskedLM,
ViltForQuestionAnswering,
ViltImageProcessor,
ViltProcessor,
)
from transformers.utils import logging
@@ -223,9 +223,9 @@ def convert_vilt_checkpoint(checkpoint_url, pytorch_dump_folder_path):
model.load_state_dict(state_dict)
# Define processor
feature_extractor = ViltFeatureExtractor(size=384)
image_processor = ViltImageProcessor(size=384)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = ViltProcessor(feature_extractor, tokenizer)
processor = ViltProcessor(image_processor, tokenizer)
# Forward pass on example inputs (image + text)
if nlvr_model:
......
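A hedged sketch of using the assembled ViLT processor on an image/question pair, mirroring the conversion script above (the image and question are illustrative):

```python
import requests
from PIL import Image
from transformers import BertTokenizer, ViltImageProcessor, ViltProcessor

image_processor = ViltImageProcessor(size=384)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = ViltProcessor(image_processor, tokenizer)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # illustrative image
image = Image.open(requests.get(url, stream=True).raw)

# The processor tokenizes the text and preprocesses the image in one call.
encoding = processor(image, "How many cats are there?", return_tensors="pt")
print(sorted(encoding.keys()))  # input_ids, attention_mask, pixel_values, pixel_mask, ...
```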
@@ -24,7 +24,7 @@ import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import ViTConfig, ViTFeatureExtractor, ViTForImageClassification, ViTModel
from transformers import ViTConfig, ViTForImageClassification, ViTImageProcessor, ViTModel
from transformers.utils import logging
@@ -175,9 +175,9 @@ def convert_vit_checkpoint(model_name, pytorch_dump_folder_path, base_model=True
model = ViTForImageClassification(config).eval()
model.load_state_dict(state_dict)
# Check outputs on an image, prepared by ViTFeatureExtractor
feature_extractor = ViTFeatureExtractor()
encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
# Check outputs on an image, prepared by ViTImageProcessor
image_processor = ViTImageProcessor()
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
@@ -192,8 +192,8 @@ def convert_vit_checkpoint(model_name, pytorch_dump_folder_path, base_model=True
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
......
@@ -25,7 +25,7 @@ import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DeiTFeatureExtractor, ViTConfig, ViTFeatureExtractor, ViTForImageClassification, ViTModel
from transformers import DeiTImageProcessor, ViTConfig, ViTForImageClassification, ViTImageProcessor, ViTModel
from transformers.utils import logging
@@ -208,12 +208,12 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
model = ViTForImageClassification(config).eval()
model.load_state_dict(state_dict)
# Check outputs on an image, prepared by ViTFeatureExtractor/DeiTFeatureExtractor
# Check outputs on an image, prepared by ViTImageProcessor/DeiTImageProcessor
if "deit" in vit_name:
feature_extractor = DeiTFeatureExtractor(size=config.image_size)
image_processor = DeiTImageProcessor(size=config.image_size)
else:
feature_extractor = ViTFeatureExtractor(size=config.image_size)
encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
image_processor = ViTImageProcessor(size=config.image_size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
@@ -229,8 +229,8 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {vit_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
......
@@ -20,7 +20,7 @@ import requests
import torch
from PIL import Image
from transformers import ViTMAEConfig, ViTMAEFeatureExtractor, ViTMAEForPreTraining
from transformers import ViTMAEConfig, ViTMAEForPreTraining, ViTMAEImageProcessor
def rename_key(name):
@@ -120,7 +120,7 @@ def convert_vit_mae_checkpoint(checkpoint_url, pytorch_dump_folder_path):
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
feature_extractor = ViTMAEFeatureExtractor(size=config.image_size)
image_processor = ViTMAEImageProcessor(size=config.image_size)
new_state_dict = convert_state_dict(state_dict, config)
@@ -130,8 +130,8 @@ def convert_vit_mae_checkpoint(checkpoint_url, pytorch_dump_folder_path):
url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg"
image = Image.open(requests.get(url, stream=True).raw)
feature_extractor = ViTMAEFeatureExtractor(size=config.image_size)
inputs = feature_extractor(images=image, return_tensors="pt")
image_processor = ViTMAEImageProcessor(size=config.image_size)
inputs = image_processor(images=image, return_tensors="pt")
# forward pass
torch.manual_seed(2)
@@ -157,8 +157,8 @@ def convert_vit_mae_checkpoint(checkpoint_url, pytorch_dump_folder_path):
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
......
@@ -22,7 +22,7 @@ import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import ViTFeatureExtractor, ViTMSNConfig, ViTMSNModel
from transformers import ViTImageProcessor, ViTMSNConfig, ViTMSNModel
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
@@ -180,7 +180,7 @@ def convert_vit_msn_checkpoint(checkpoint_url, pytorch_dump_folder_path):
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["target_encoder"]
feature_extractor = ViTFeatureExtractor(size=config.image_size)
image_processor = ViTImageProcessor(size=config.image_size)
remove_projection_head(state_dict)
rename_keys = create_rename_keys(config, base_model=True)
@@ -195,10 +195,10 @@ def convert_vit_msn_checkpoint(checkpoint_url, pytorch_dump_folder_path):
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
feature_extractor = ViTFeatureExtractor(
image_processor = ViTImageProcessor(
size=config.image_size, image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD
)
inputs = feature_extractor(images=image, return_tensors="pt")
inputs = image_processor(images=image, return_tensors="pt")
# forward pass
torch.manual_seed(2)
@@ -224,8 +224,8 @@ def convert_vit_msn_checkpoint(checkpoint_url, pytorch_dump_folder_path):
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
......
@@ -23,7 +23,7 @@ from huggingface_hub import hf_hub_download
from transformers import (
CLIPTokenizer,
CLIPTokenizerFast,
VideoMAEFeatureExtractor,
VideoMAEImageProcessor,
XCLIPConfig,
XCLIPModel,
XCLIPProcessor,
@@ -291,10 +291,10 @@ def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
model.eval()
size = 336 if model_name == "xclip-large-patch14-16-frames" else 224
feature_extractor = VideoMAEFeatureExtractor(size=size)
image_processor = VideoMAEImageProcessor(size=size)
slow_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
fast_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
processor = XCLIPProcessor(feature_extractor=feature_extractor, tokenizer=fast_tokenizer)
processor = XCLIPProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)
video = prepare_video(num_frames)
inputs = processor(
......
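A hedged sketch of building the X-CLIP processor with the renamed `image_processor` keyword and running it on text plus a dummy clip (the clip, prompts, and padding choice are illustrative):

```python
import numpy as np
from transformers import CLIPTokenizerFast, VideoMAEImageProcessor, XCLIPProcessor

image_processor = VideoMAEImageProcessor(size=224)
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
processor = XCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)

# 8 dummy RGB frames standing in for a real clip.
video = list(np.random.randint(0, 256, (8, 224, 224, 3), dtype=np.uint8))
inputs = processor(
    text=["playing sports", "cooking"], videos=video, return_tensors="pt", padding=True
)
```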
@@ -24,7 +24,7 @@ import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import YolosConfig, YolosFeatureExtractor, YolosForObjectDetection
from transformers import YolosConfig, YolosForObjectDetection, YolosImageProcessor
from transformers.utils import logging
@@ -172,10 +172,10 @@ def convert_yolos_checkpoint(
new_state_dict = convert_state_dict(state_dict, model)
model.load_state_dict(new_state_dict)
# Check outputs on an image, prepared by YolosFeatureExtractor
# Check outputs on an image, prepared by YolosImageProcessor
size = 800 if yolos_name != "yolos_ti" else 512
feature_extractor = YolosFeatureExtractor(format="coco_detection", size=size)
encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
image_processor = YolosImageProcessor(format="coco_detection", size=size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
outputs = model(**encoding)
logits, pred_boxes = outputs.logits, outputs.pred_boxes
@@ -224,8 +224,8 @@ def convert_yolos_checkpoint(
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {yolos_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving feature extractor to {pytorch_dump_folder_path}")
feature_extractor.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model_mapping = {
@@ -238,7 +238,7 @@ def convert_yolos_checkpoint(
print("Pushing to the hub...")
model_name = model_mapping[yolos_name]
feature_extractor.push_to_hub(model_name, organization="hustvl")
image_processor.push_to_hub(model_name, organization="hustvl")
model.push_to_hub(model_name, organization="hustvl")
......
@@ -19,7 +19,7 @@ from pathlib import Path
from packaging import version
from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
from .. import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer
from ..utils import logging
from ..utils.import_utils import is_optimum_available
from .convert import export, validate_model_outputs
@@ -145,6 +145,8 @@ def export_with_transformers(args):
preprocessor = get_preprocessor(args.model)
elif args.preprocessor == "tokenizer":
preprocessor = AutoTokenizer.from_pretrained(args.model)
elif args.preprocessor == "image_processor":
preprocessor = AutoImageProcessor.from_pretrained(args.model)
elif args.preprocessor == "feature_extractor":
preprocessor = AutoFeatureExtractor.from_pretrained(args.model)
elif args.preprocessor == "processor":
@@ -213,7 +215,7 @@ def main():
parser.add_argument(
"--preprocessor",
type=str,
choices=["auto", "tokenizer", "feature_extractor", "processor"],
choices=["auto", "tokenizer", "feature_extractor", "image_processor", "processor"],
default="auto",
help="Which type of preprocessor to use. 'auto' tries to automatically detect it.",
)
......
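A hedged sketch of what the new `image_processor` choice does inside `export_with_transformers`: the preprocessor is loaded with `AutoImageProcessor` instead of `AutoFeatureExtractor`. The model name below is illustrative, and the CLI line in the comment is an assumed invocation rather than something shown in this diff:

```python
# Roughly equivalent CLI (assumed):
#   python -m transformers.onnx --model google/vit-base-patch16-224 --preprocessor image_processor onnx_output/
from transformers import AutoImageProcessor

# What the new elif branch above does for args.preprocessor == "image_processor".
preprocessor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
```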
@@ -49,7 +49,7 @@ if is_vision_available():
import PIL
from PIL import Image
from transformers import BeitFeatureExtractor
from transformers import BeitImageProcessor
class BeitModelTester:
@@ -342,18 +342,16 @@ def prepare_img():
@require_vision
class BeitModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return (
BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
)
def default_image_processor(self):
return BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
@slow
def test_inference_masked_image_modeling_head(self):
model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(torch_device)
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
# prepare bool_masked_pos
bool_masked_pos = torch.ones((1, 196), dtype=torch.bool).to(torch_device)
@@ -377,9 +375,9 @@ class BeitModelIntegrationTest(unittest.TestCase):
def test_inference_image_classification_head_imagenet_1k(self):
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -403,9 +401,9 @@ class BeitModelIntegrationTest(unittest.TestCase):
torch_device
)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -428,11 +426,11 @@ class BeitModelIntegrationTest(unittest.TestCase):
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
model = model.to(torch_device)
feature_extractor = BeitFeatureExtractor(do_resize=True, size=640, do_center_crop=False)
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = Image.open(ds[0]["file"])
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -471,11 +469,11 @@ class BeitModelIntegrationTest(unittest.TestCase):
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
model = model.to(torch_device)
feature_extractor = BeitFeatureExtractor(do_resize=True, size=640, do_center_crop=False)
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = Image.open(ds[0]["file"])
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -483,10 +481,10 @@ class BeitModelIntegrationTest(unittest.TestCase):
outputs.logits = outputs.logits.detach().cpu()
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
expected_shape = torch.Size((500, 300))
self.assertEqual(segmentation[0].shape, expected_shape)
segmentation = feature_extractor.post_process_semantic_segmentation(outputs=outputs)
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
expected_shape = torch.Size((160, 160))
self.assertEqual(segmentation[0].shape, expected_shape)
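A hedged sketch of the semantic-segmentation post-processing exercised by the test above; the checkpoint, processor arguments, and `target_sizes` come from the test, while the input image is illustrative:

```python
import requests
import torch
from PIL import Image
from transformers import BeitForSemanticSegmentation, BeitImageProcessor

image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # illustrative image
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# With `target_sizes`, the predicted maps are resized to the requested (height, width);
# without it, they stay at the model's own output resolution.
segmentation = image_processor.post_process_semantic_segmentation(
    outputs=outputs, target_sizes=[(500, 300)]
)
print(segmentation[0].shape)  # torch.Size([500, 300])
```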
@@ -33,7 +33,7 @@ if is_flax_available():
if is_vision_available():
from PIL import Image
from transformers import BeitFeatureExtractor
from transformers import BeitImageProcessor
class FlaxBeitModelTester(unittest.TestCase):
@@ -219,18 +219,16 @@ def prepare_img():
@require_flax
class FlaxBeitModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return (
BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
)
def default_image_processor(self):
return BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
@slow
def test_inference_masked_image_modeling_head(self):
model = FlaxBeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
pixel_values = image_processor(images=image, return_tensors="np").pixel_values
# prepare bool_masked_pos
bool_masked_pos = np.ones((1, 196), dtype=bool)
@@ -253,9 +251,9 @@ class FlaxBeitModelIntegrationTest(unittest.TestCase):
def test_inference_image_classification_head_imagenet_1k(self):
model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="np")
inputs = image_processor(images=image, return_tensors="np")
# forward pass
outputs = model(**inputs)
@@ -276,9 +274,9 @@ class FlaxBeitModelIntegrationTest(unittest.TestCase):
def test_inference_image_classification_head_imagenet_22k(self):
model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-large-patch16-224-pt22k-ft22k")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="np")
inputs = image_processor(images=image, return_tensors="np")
# forward pass
outputs = model(**inputs)
......
@@ -297,7 +297,7 @@ def prepare_img():
@require_vision
class BitModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
return (
BitImageProcessor.from_pretrained(BIT_PRETRAINED_MODEL_ARCHIVE_LIST[0]) if is_vision_available() else None
)
@@ -306,9 +306,9 @@ class BitModelIntegrationTest(unittest.TestCase):
def test_inference_image_classification_head(self):
model = BitForImageClassification.from_pretrained(BIT_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -145,7 +145,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
pass
def test_call_pil(self):
# Initialize feature_extractor
# Initialize image processor
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False)
@@ -176,7 +176,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
)
def test_call_numpy(self):
# Initialize feature_extractor
# Initialize image processor
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
@@ -207,7 +207,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
)
def test_call_pytorch(self):
# Initialize feature_extractor
# Initialize image processor
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
@@ -238,7 +238,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te
)
def test_equivalence_pad_and_create_pixel_mask(self):
# Initialize feature_extractors
# Initialize image processors
image_processing_1 = self.image_processing_class(**self.image_processor_dict)
image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False)
# create random PyTorch tensors
......
@@ -43,7 +43,7 @@ if is_timm_available():
if is_vision_available():
from PIL import Image
from transformers import ConditionalDetrFeatureExtractor
from transformers import ConditionalDetrImageProcessor
class ConditionalDetrModelTester:
@@ -493,9 +493,9 @@ def prepare_img():
@slow
class ConditionalDetrModelIntegrationTests(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
return (
ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
if is_vision_available()
else None
)
@@ -503,9 +503,9 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
def test_inference_no_head(self):
model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
with torch.no_grad():
outputs = model(**encoding)
@@ -522,9 +522,9 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
torch_device
)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device)
encoding = image_processor(images=image, return_tensors="pt").to(torch_device)
pixel_values = encoding["pixel_values"].to(torch_device)
pixel_mask = encoding["pixel_mask"].to(torch_device)
@@ -547,7 +547,7 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
# verify postprocessing
results = feature_extractor.post_process_object_detection(
results = image_processor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355]).to(torch_device)
......
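A hedged sketch of the object-detection post-processing verified above; the checkpoint, threshold, and `target_sizes` come from the test, while the input image is illustrative:

```python
import requests
import torch
from PIL import Image
from transformers import ConditionalDetrForObjectDetection, ConditionalDetrImageProcessor

checkpoint = "microsoft/conditional-detr-resnet-50"
image_processor = ConditionalDetrImageProcessor.from_pretrained(checkpoint)
model = ConditionalDetrForObjectDetection.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # illustrative image
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Convert raw logits/boxes to per-image detections in absolute (xmin, ymin, xmax, ymax) pixels.
results = image_processor.post_process_object_detection(
    outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 1) for c in box.tolist()])
```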
@@ -38,7 +38,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class ConvNextModelTester:
@@ -285,16 +285,16 @@ def prepare_img():
@require_vision
class ConvNextModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
def default_image_processor(self):
return AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -38,7 +38,7 @@ if is_tf_available():
if is_vision_available():
from PIL import Image
from transformers import ConvNextFeatureExtractor
from transformers import ConvNextImageProcessor
class TFConvNextModelTester:
@@ -279,18 +279,16 @@ def prepare_img():
@require_vision
class TFConvNextModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return (
ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
)
def default_image_processor(self):
return ConvNextImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="tf")
inputs = image_processor(images=image, return_tensors="tf")
# forward pass
outputs = model(**inputs)
......
@@ -38,7 +38,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class CvtConfigTester(ConfigTester):
@@ -264,16 +264,16 @@ def prepare_img():
@require_vision
class CvtModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoFeatureExtractor.from_pretrained(CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
def default_image_processor(self):
return AutoImageProcessor.from_pretrained(CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0])
@slow
def test_inference_image_classification_head(self):
model = CvtForImageClassification.from_pretrained(CVT_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......