Unverified commit 91c2278b authored by amyeroberts, committed by GitHub

Update modeling doc strings FE -> IP (#21106)

* Update docs examples FE -> IP

* Remove _IMAGE_PROCESSOR_FOR_DOC
parent 5d3cb760
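Across the files below the change is uniform: the `processor_class` argument is dropped from `add_code_sample_docstrings`, the module-level `_FEAT_EXTRACTOR_FOR_DOC` constant that fed it is removed, and docstring references to feature extractor classes now point at the corresponding image processor classes. For orientation, a generated PyTorch sample reads roughly as follows after the change; this is a sketch in the spirit of the updated sample templates at the end of this diff, not the exact rendered docstring, and uses the Swinv2 checkpoint named below.

>>> from transformers import AutoImageProcessor, Swinv2Model
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
>>> model = Swinv2Model.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state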
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "Swinv2Config"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
@@ -1043,7 +1042,6 @@ class Swinv2Model(Swinv2PreTrainedModel):
     @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=Swinv2ModelOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -1251,7 +1249,6 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):
     @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=Swinv2ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -38,7 +38,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "VanConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
@@ -435,7 +434,6 @@ class VanModel(VanPreTrainedModel):
     @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPoolingAndNoAttention,
         config_class=_CONFIG_FOR_DOC,
@@ -493,7 +491,6 @@ class VanForImageClassification(VanPreTrainedModel):
     @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
......
@@ -635,8 +635,8 @@ VILT_INPUTS_DOCSTRING = r"""
             [What are token type IDs?](../glossary#token-type-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
-            [`ViltFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
+            [`ViltImageProcessor.__call__`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -690,8 +690,8 @@ VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING = r"""
             [What are token type IDs?](../glossary#token-type-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViltFeatureExtractor`]. See
-            [`ViltFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViltImageProcessor`]. See
+            [`ViltImageProcessor.__call__`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, num_images, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
......
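As a reference for the docstring above, a minimal sketch of how [`ViltImageProcessor`] produces the `pixel_values` and `pixel_mask` tensors it describes; the COCO image URL and the `dandelin/vilt-b32-mlm` checkpoint are illustrative choices, not taken from this diff.

>>> from PIL import Image
>>> import requests
>>> from transformers import ViltImageProcessor

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = ViltImageProcessor.from_pretrained("dandelin/vilt-b32-mlm")
>>> encoding = image_processor(image, return_tensors="pt")
>>> pixel_values = encoding.pixel_values  # (batch_size, num_channels, height, width)
>>> pixel_mask = encoding.pixel_mask  # (batch_size, height, width)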
@@ -556,13 +556,13 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
     >>> from transformers import (
     ...     FlaxVisionTextDualEncoderModel,
     ...     VisionTextDualEncoderProcessor,
-    ...     ViTFeatureExtractor,
+    ...     ViTImageProcessor,
     ...     BertTokenizer,
     ... )

     >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
-    >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
+    >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+    >>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
     >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
     ...     "google/vit-base-patch16-224", "bert-base-uncased"
     ... )
......
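The shown part of the example ends after building the processor and model; a possible continuation, not part of the diff shown here, illustrating how the two are used together (the image URL and caption are placeholders):

>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="np", padding=True)
>>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values)
>>> logits_per_image = outputs.logits_per_image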
@@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -670,7 +669,6 @@ class TFViTModel(TFViTPreTrainedModel):
     @unpack_inputs
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TFBaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -764,7 +762,6 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassificationLoss):
     @unpack_inputs
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=TFSequenceClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -42,7 +42,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -536,7 +535,6 @@ class ViTModel(ViTPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -765,7 +763,6 @@ class ViTForImageClassification(ViTPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -37,7 +37,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTHybridConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384"
@@ -508,8 +507,8 @@ VIT_START_DOCSTRING = r"""
 VIT_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.__call__`] for details.
         head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
             Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -560,7 +559,6 @@ class ViTHybridModel(ViTHybridPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -664,7 +662,6 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -584,7 +584,7 @@ X_CLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -619,7 +619,7 @@ X_CLIP_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
         output_attentions (`bool`, *optional*):
......
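For the docstrings above, a minimal sketch of producing per-frame `pixel_values` with [`CLIPImageProcessor`]; the dummy frames and the `openai/clip-vit-base-patch32` checkpoint are stand-ins chosen for illustration only, not taken from this diff.

>>> import numpy as np
>>> from transformers import CLIPImageProcessor

>>> # eight dummy RGB frames standing in for a decoded video clip
>>> frames = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8) for _ in range(8)]

>>> image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
>>> pixel_values = image_processor(frames, return_tensors="pt").pixel_values  # (num_frames, num_channels, height, width)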
@@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "YolosConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "YolosImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
@@ -627,7 +626,6 @@ class YolosModel(YolosPreTrainedModel):
     @add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
......
@@ -861,10 +861,10 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
     >>> dataset = load_dataset("huggingface/cats-image")
     >>> image = dataset["test"]["image"][0]

-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
     >>> model = {model_class}.from_pretrained("{checkpoint}")

-    >>> inputs = feature_extractor(image, return_tensors="tf")
+    >>> inputs = image_processor(image, return_tensors="tf")
     >>> outputs = model(**inputs)

     >>> last_hidden_states = outputs.last_hidden_state
@@ -884,10 +884,10 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
     >>> dataset = load_dataset("huggingface/cats-image")
     >>> image = dataset["test"]["image"][0]

-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> image_processor = {processor_class}.from_pretrained("{checkpoint}")
     >>> model = {model_class}.from_pretrained("{checkpoint}")

-    >>> inputs = feature_extractor(image, return_tensors="tf")
+    >>> inputs = image_processor(image, return_tensors="tf")
     >>> logits = model(**inputs).logits

     >>> # model predicts one of the 1000 ImageNet classes
......
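Filled in with a concrete model pair, the updated `TF_VISION_SEQ_CLASS_SAMPLE` renders along these lines; the `google/vit-base-patch16-224` checkpoint is our own choice for illustration, not part of the template.

>>> from transformers import AutoImageProcessor, TFViTForImageClassification
>>> import tensorflow as tf
>>> from datasets import load_dataset

>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]

>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
>>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

>>> inputs = image_processor(image, return_tensors="tf")
>>> logits = model(**inputs).logits

>>> # model predicts one of the 1000 ImageNet classes
>>> predicted_label = int(tf.math.argmax(logits, axis=-1))
>>> print(model.config.id2label[predicted_label])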