Unverified Commit 91c2278b authored by amyeroberts, committed by GitHub

Update modeling doc strings FE -> IP (#21106)

* Update docs examples FE -> IP

* Remove _IMAGE_PROCESSOR_FOR_DOC
parent 5d3cb760
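The rename this commit rolls through the docstrings reflects the library-wide move from vision feature extractors to image processors; from the user side the migration is just a class swap. A minimal sketch of the pattern, assuming the `datasets` library is installed (the checkpoint and dataset names below are taken from docstrings touched in this diff, not prescribed by it):

from datasets import load_dataset
from transformers import AutoImageProcessor

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

# Before this migration one would write: AutoFeatureExtractor.from_pretrained(...)
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
inputs = image_processor(image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 224, 224]) for this checkpoint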
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "Swinv2Config"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
@@ -1043,7 +1042,6 @@ class Swinv2Model(Swinv2PreTrainedModel):
     @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=Swinv2ModelOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -1251,7 +1249,6 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):
     @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=Swinv2ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
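With `processor_class` dropped from the Swinv2 decorators above, the generated code samples identify the processor through `AutoImageProcessor`. A hedged sketch of the equivalent hand-written usage (not the literal rendered docstring), using the base checkpoint named in this hunk:

import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, Swinv2Model

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
model = Swinv2Model.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)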
@@ -38,7 +38,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "VanConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
@@ -435,7 +434,6 @@ class VanModel(VanPreTrainedModel):
     @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPoolingAndNoAttention,
         config_class=_CONFIG_FOR_DOC,
@@ -493,7 +491,6 @@ class VanForImageClassification(VanPreTrainedModel):
     @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
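The VAN classification head follows the same pattern. A sketch of the classification path, assuming `Visual-Attention-Network/van-base` (the base checkpoint from this hunk) also serves as the image-classification checkpoint:

import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, VanForImageClassification

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("Visual-Attention-Network/van-base")
model = VanForImageClassification.from_pretrained("Visual-Attention-Network/van-base")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])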
@@ -635,8 +635,8 @@ VILT_INPUTS_DOCSTRING = r"""
             [What are token type IDs?](../glossary#token-type-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
-            [`ViltFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
+            [`ViltImageProcessor.__call__`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -690,8 +690,8 @@ VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING = r"""
             [What are token type IDs?](../glossary#token-type-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
-            [`ViltFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
+            [`ViltImageProcessor.__call__`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, num_images, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
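The ViLT docstrings above describe `pixel_values` and `pixel_mask`; both come out of the image processor call. A small sketch of that preprocessing step alone (the `dandelin/vilt-b32-mlm` checkpoint is an assumption for illustration, not named in this diff):

from datasets import load_dataset
from transformers import ViltImageProcessor

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = ViltImageProcessor.from_pretrained("dandelin/vilt-b32-mlm")
encoding = image_processor(image, return_tensors="pt")
# `pixel_values` has shape (batch_size, num_channels, height, width); a pixel mask marking
# real versus padded pixels is returned alongside it when padding is applied.
print({key: value.shape for key, value in encoding.items()})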
@@ -556,13 +556,13 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
     >>> from transformers import (
     ...     FlaxVisionTextDualEncoderModel,
     ...     VisionTextDualEncoderProcessor,
-    ...     ViTFeatureExtractor,
+    ...     ViTImageProcessor,
     ...     BertTokenizer,
     ... )
     >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
-    >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
+    >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+    >>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
     >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
     ...     "google/vit-base-patch16-224", "bert-base-uncased"
     ... )
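The corrected Flax example above stops after constructing the model. A plausible continuation showing the processor and model in use (the caption strings and the cats-image dataset are illustrative, and the projection layers of a freshly combined dual encoder are randomly initialized):

from datasets import load_dataset
from transformers import (
    FlaxVisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    ViTImageProcessor,
    BertTokenizer,
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
    "google/vit-base-patch16-224", "bert-base-uncased"
)

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True
)
outputs = model(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pixel_values=inputs.pixel_values,
)
logits_per_image = outputs.logits_per_image  # image-text similarity scores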
@@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -670,7 +669,6 @@ class TFViTModel(TFViTPreTrainedModel):
     @unpack_inputs
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TFBaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -764,7 +762,6 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
     @unpack_inputs
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=TFSequenceClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
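For the TensorFlow ViT classes touched above, the rendered sample builds the processor from `ViTImageProcessor` (the value the removed constant held) and feeds TF tensors. A hedged sketch, assuming `google/vit-base-patch16-224` as the classification checkpoint (it appears elsewhere in this diff, but the `_IMAGE_CLASS_CHECKPOINT` value itself is not shown):

import tensorflow as tf
from datasets import load_dataset
from transformers import ViTImageProcessor, TFViTForImageClassification

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

inputs = image_processor(image, return_tensors="tf")
logits = model(**inputs).logits

# model predicts one of the 1000 ImageNet classes
predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class])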
@@ -42,7 +42,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -536,7 +535,6 @@ class ViTModel(ViTPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -765,7 +763,6 @@ class ViTForImageClassification(ViTPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTHybridConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384"
@@ -508,8 +507,8 @@ VIT_START_DOCSTRING = r"""
 VIT_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.__call__`] for details.
         head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
             Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -560,7 +559,6 @@ class ViTHybridModel(ViTHybridPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -664,7 +662,6 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -584,7 +584,7 @@ X_CLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -619,7 +619,7 @@ X_CLIP_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
         output_attentions (`bool`, *optional*):
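The X-CLIP docstrings now point to `CLIPImageProcessor` for producing `pixel_values`. A minimal sketch of that preprocessing step alone, on a single frame (the `openai/clip-vit-base-patch32` checkpoint is an assumption; video-frame sampling and the X-CLIP model call itself are out of scope here):

from datasets import load_dataset
from transformers import CLIPImageProcessor

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
pixel_values = image_processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # (batch_size, num_channels, height, width), e.g. (1, 3, 224, 224)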
@@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "YolosConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "YolosImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
@@ -627,7 +626,6 @@ class YolosModel(YolosPreTrainedModel):
     @add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
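For YOLOS, the image processor named in the removed constant also carries the detection post-processing. A hedged sketch of end-to-end usage with the `hustvl/yolos-small` checkpoint from the hunk above (the post-processing call reflects the image-processor API of this era of the library):

import torch
from datasets import load_dataset
from transformers import YolosImageProcessor, YolosForObjectDetection

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

target_sizes = torch.tensor([image.size[::-1]])  # (height, width) of the original image
results = image_processor.post_process_object_detection(
    outputs, threshold=0.9, target_sizes=target_sizes
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())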
@@ -861,10 +861,10 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
     >>> dataset = load_dataset("huggingface/cats-image")
     >>> image = dataset["test"]["image"][0]
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
     >>> model = {model_class}.from_pretrained("{checkpoint}")
-    >>> inputs = feature_extractor(image, return_tensors="tf")
+    >>> inputs = image_processor(image, return_tensors="tf")
     >>> outputs = model(**inputs)
     >>> last_hidden_states = outputs.last_hidden_state
@@ -884,10 +884,10 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
     >>> dataset = load_dataset("huggingface/cats-image")
     >>> image = dataset["test"]["image"][0]
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
     >>> model = {model_class}.from_pretrained("{checkpoint}")
-    >>> inputs = feature_extractor(image, return_tensors="tf")
+    >>> inputs = image_processor(image, return_tensors="tf")
     >>> logits = model(**inputs).logits
     >>> # model predicts one of the 1000 ImageNet classes