Unverified Commit 91c2278b authored by amyeroberts, committed by GitHub

Update modeling doc strings FE -> IP (#21106)

* Update docs examples FE -> IP

* Remove _IMAGE_PROCESSOR_FOR_DOC
parent 5d3cb760
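
Every hunk below makes the same two edits: docstring references to a vision `FeatureExtractor` class are pointed at the equivalent `ImageProcessor` class, and the module-level `_FEAT_EXTRACTOR_FOR_DOC` constant (with the matching `processor_class=` argument of `add_code_sample_docstrings`) is removed. As orientation, here is a minimal sketch of the renamed preprocessing API in use, reusing the `microsoft/beit-base-patch16-224-pt22k` checkpoint named in the BEiT hunk below; the sketch itself is not part of this commit:

```python
from PIL import Image
import requests
from transformers import AutoImageProcessor

# Image processors are drop-in replacements for the old vision feature
# extractors: same checkpoint, same __call__ signature.
image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
print(inputs.pixel_values.shape)  # (batch_size, num_channels, height, width)
```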
@@ -98,7 +98,7 @@ ALTCLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -133,7 +133,7 @@ ALTCLIP_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
         output_attentions (`bool`, *optional*):
......
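
Per the updated AltCLIP docstrings, `pixel_values` come from [`CLIPImageProcessor`]. A sketch of feeding them to the vision tower; the `BAAI/AltCLIP` checkpoint is an assumption here, not named in the diff:

```python
import torch
from PIL import Image
import requests
from transformers import CLIPImageProcessor, AltCLIPVisionModel

# Assumed checkpoint for illustration; AltCLIP ships a CLIP-style image processor.
image_processor = CLIPImageProcessor.from_pretrained("BAAI/AltCLIP")
model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)  # pixel_values -> pooled vision features
print(outputs.pooler_output.shape)
```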
@@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "BeitConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k"
@@ -646,7 +645,6 @@ class BeitModel(BeitPreTrainedModel):
     @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BeitModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -844,7 +842,6 @@ class BeitForImageClassification(BeitPreTrainedModel):
     @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
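
The two BEiT decorator hunks drop `processor_class=_FEAT_EXTRACTOR_FOR_DOC` from `add_code_sample_docstrings`. Reconstructed from the hunks, the post-change call site reads roughly as below; how the doc utility now picks the preprocessor for generated samples (presumably via the checkpoint's auto classes) is outside this diff, so treat that part as an assumption:

```python
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,  # "microsoft/beit-base-patch16-224-pt22k"
    output_type=BeitModelOutputWithPooling,
    config_class=_CONFIG_FOR_DOC,  # "BeitConfig"
)
def forward(self, pixel_values=None, bool_masked_pos=None, head_mask=None, **kwargs):
    ...  # signature abbreviated; only the decorators changed in this commit
```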
@@ -46,7 +46,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "BitConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/bit-50"
@@ -688,8 +687,8 @@ BIT_START_DOCSTRING = r"""
 BIT_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.__call__`] for details.
         output_hidden_states (`bool`, *optional*):
             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
@@ -723,7 +722,6 @@ class BitModel(BitPreTrainedModel):
     @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPoolingAndNoAttention,
         config_class=_CONFIG_FOR_DOC,
@@ -782,7 +780,6 @@ class BitForImageClassification(BitPreTrainedModel):
     @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
......
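
BiT's docstrings now point at `AutoImageProcessor`. End to end with the `google/bit-50` checkpoint from the constants above (a usage sketch, not code from this commit):

```python
import torch
from PIL import Image
import requests
from transformers import AutoImageProcessor, BitForImageClassification

image_processor = AutoImageProcessor.from_pretrained("google/bit-50")
model = BitForImageClassification.from_pretrained("google/bit-50")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])  # predicted ImageNet label
```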
@@ -810,7 +810,7 @@ CHINESE_CLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`ChineseCLIPFeatureExtractor`]. See [`ChineseCLIPFeatureExtractor.__call__`] for details.
+            [`ChineseCLIPImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -853,7 +853,7 @@ CHINESE_CLIP_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`ChineseCLIPFeatureExtractor`]. See [`ChineseCLIPFeatureExtractor.__call__`] for details.
+            [`ChineseCLIPImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
         output_attentions (`bool`, *optional*):
......
@@ -521,7 +521,7 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -556,7 +556,7 @@ CLIP_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
         output_attentions (`bool`, *optional*):
......
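
In practice CLIP's `pixel_values` usually come from `CLIPProcessor`, which bundles the tokenizer with this image processor; passing `return_loss=True` to the model adds the contrastive loss described in the inputs docstring. A sketch, assuming the standard `openai/clip-vit-base-patch32` checkpoint (not part of this diff):

```python
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)  # image-text match probabilities
```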
@@ -108,7 +108,7 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -143,7 +143,7 @@ CLIP_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -893,7 +893,7 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
         Args:
             pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
                 Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
-                using [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+                using [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.

         Returns:
             image_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The image embeddings obtained by
......
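
The Flax docstrings differ only in dtype (`numpy.ndarray`), and `get_image_features` returns the projected image embeddings mentioned above. A sketch under the same `openai/clip-vit-base-patch32` assumption as the PyTorch example:

```python
from PIL import Image
import requests
from transformers import CLIPImageProcessor, FlaxCLIPModel

model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="np")  # Flax consumes numpy arrays

image_features = model.get_image_features(**inputs)  # shape (batch_size, output_dim)
```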
@@ -993,8 +993,8 @@ CLIP_TEXT_INPUTS_DOCSTRING = r"""
 CLIP_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
-            [`CLIPFeatureExtractor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
+            Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
+            [`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
             return the attentions tensors of all attention layers. See `attentions` under returned tensors for more
             detail. This argument can be used only in eager mode, in graph mode the value in the config will be used
             instead.
@@ -1020,8 +1020,8 @@ CLIP_INPUTS_DOCSTRING = r"""
             [What are input IDs?](../glossary#input-ids)
         pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
-            [`CLIPFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`CLIPImageProcessor`]. See
+            [`CLIPImageProcessor.__call__`] for details.
         attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
......
@@ -530,7 +530,7 @@ CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -565,7 +565,7 @@ CLIPSEG_INPUTS_DOCSTRING = r"""
             [What are position IDs?](../glossary#position-ids)
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
         output_attentions (`bool`, *optional*):
......
@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ConvNextConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ConvNextImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/convnext-tiny-224"
@@ -346,7 +345,6 @@ class ConvNextModel(ConvNextPreTrainedModel):
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPoolingAndNoAttention,
         config_class=_CONFIG_FOR_DOC,
@@ -414,7 +412,6 @@ class ConvNextForImageClassification(ConvNextPreTrainedModel):
     @add_start_docstrings_to_model_forward(CONVNEXT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
......
@@ -35,7 +35,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "CvtConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
@@ -605,7 +604,6 @@ class CvtModel(CvtPreTrainedModel):
     @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithCLSToken,
         config_class=_CONFIG_FOR_DOC,
@@ -668,7 +666,6 @@ class CvtForImageClassification(CvtPreTrainedModel):
     @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
......
@@ -48,7 +48,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "Data2VecVisionConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
@@ -660,7 +659,6 @@ class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
     @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=Data2VecVisionModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -760,7 +758,6 @@ class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
     @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=ImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -53,7 +53,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "Data2VecVisionConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "BeitImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
@@ -894,7 +893,6 @@ class TFData2VecVisionModel(TFData2VecVisionPreTrainedModel):
     @unpack_inputs
     @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TFData2VecVisionModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -960,7 +958,6 @@ class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TF
     @unpack_inputs
     @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=TFSequenceClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
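
Data2Vec-Vision reuses `BeitImageProcessor` (the constant removed above), and the TF variant takes the same preprocessor output with `return_tensors="tf"`. A sketch with the `facebook/data2vec-vision-base` checkpoint from the constants:

```python
from PIL import Image
import requests
from transformers import BeitImageProcessor, TFData2VecVisionModel

image_processor = BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base")
# Add from_pt=True if the checkpoint only ships PyTorch weights.
model = TFData2VecVisionModel.from_pretrained("facebook/data2vec-vision-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="tf")

outputs = model(**inputs)  # TFData2VecVisionModelOutputWithPooling
print(outputs.last_hidden_state.shape)
```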
@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "DeiTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "DeiTImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
@@ -483,7 +482,6 @@ class DeiTModel(DeiTPreTrainedModel):
     @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -854,7 +852,6 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
     @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=DeiTForImageClassificationWithTeacherOutput,
         config_class=_CONFIG_FOR_DOC,
......
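
DeiT pairs `DeiTImageProcessor` with the distilled classifier; the with-teacher head reports the average of the class-token and distillation-token logits. A usage sketch with the `facebook/deit-base-distilled-patch16-224` checkpoint named above:

```python
import torch
from PIL import Image
import requests
from transformers import DeiTImageProcessor, DeiTForImageClassificationWithTeacher

ckpt = "facebook/deit-base-distilled-patch16-224"
image_processor = DeiTImageProcessor.from_pretrained(ckpt)
model = DeiTForImageClassificationWithTeacher.from_pretrained(ckpt)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # mean of cls_logits and distillation_logits
print(model.config.id2label[logits.argmax(-1).item()])
```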
@@ -52,7 +52,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "DeiTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "DeiTImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
@@ -651,7 +650,6 @@ class TFDeiTModel(TFDeiTPreTrainedModel):
     @unpack_inputs
     @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TFBaseModelOutputWithPooling,
         config_class=_CONFIG_FOR_DOC,
@@ -1009,7 +1007,6 @@ class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel):
     @unpack_inputs
     @add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=TFDeiTForImageClassificationWithTeacherOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -57,7 +57,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "DinatConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224"
@@ -730,7 +729,6 @@ class DinatModel(DinatPreTrainedModel):
     @add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=DinatModelOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -810,7 +808,6 @@ class DinatForImageClassification(DinatPreTrainedModel):
     @add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
         output_type=DinatImageClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -43,7 +43,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "DonutSwinConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
@@ -847,8 +846,8 @@ SWIN_START_DOCSTRING = r"""
 SWIN_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.__call__`] for details.
         head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
             Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -898,7 +897,6 @@ class DonutSwinModel(DonutSwinPreTrainedModel):
     @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=DonutSwinModelOutput,
         config_class=_CONFIG_FOR_DOC,
......
@@ -49,7 +49,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "DPTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "DPTImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "Intel/dpt-large"
@@ -898,7 +897,6 @@ class DPTModel(DPTPreTrainedModel):
     @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPoolingAndIntermediateActivations,
         config_class=_CONFIG_FOR_DOC,
......
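
DPT follows the same pattern with `DPTImageProcessor`; the base model returns the intermediate activations its prediction heads consume. A sketch against the `Intel/dpt-large` checkpoint from the constants above:

```python
import torch
from PIL import Image
import requests
from transformers import DPTImageProcessor, DPTModel

image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTModel.from_pretrained("Intel/dpt-large")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)  # BaseModelOutputWithPoolingAndIntermediateActivations
print(outputs.last_hidden_state.shape)
```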
@@ -51,7 +51,6 @@ _CHECKPOINT_FOR_DOC = "facebook/flava-full"
 # Codebook docstring
 _CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"
-_FEAT_EXTRACTOR_FOR_DOC = "FlavaFeatureExtractor"
 _CONFIG_CLASS_FOR_IMAGE_MODEL_DOC = "FlavaImageConfig"
 _CONFIG_CLASS_FOR_TEXT_MODEL_DOC = "FlavaTextConfig"
 _CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC = "FlavaMultimodalConfig"
@@ -750,8 +749,8 @@ FLAVA_INPUTS_DOCSTRING_COMMON = r"""
 FLAVA_IMAGE_INPUTS_DOCSTRING_BASE = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`FlavaFeatureExtractor`]. See
-            [`FlavaFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`FlavaImageProcessor`]. See
+            [`FlavaImageProcessor.__call__`] for details.
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
@@ -926,7 +925,6 @@ class FlavaImageModel(FlavaPreTrainedModel):
     @add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutputWithPooling,
         config_class=_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC,
@@ -1568,22 +1566,22 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
         """
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-                Pixel values. Codebook pixel values can be obtained using [`FlavaFeatureExtractor`] by passing
-                `return_codebook_pixels=True`. See [`FlavaFeatureExtractor.__call__`] for details.
+                Pixel values. Codebook pixel values can be obtained using [`FlavaImageProcessor`] by passing
+                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

         Examples:

         ```python
         >>> from PIL import Image
         >>> import requests
-        >>> from transformers import FlavaFeatureExtractor, FlavaImageCodebook
+        >>> from transformers import FlavaImageProcessor, FlavaImageCodebook

         >>> model = FlavaImageCodebook.from_pretrained("{0}")
-        >>> feature_extractor = FlavaFeatureExtractor.from_pretrained("{0}")
+        >>> image_processor = FlavaImageProcessor.from_pretrained("{0}")

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)

-        >>> inputs = feature_extractor([image], return_codebook_pixels=True, return_tensors="pt")
+        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
         >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

         >>> outputs = model.get_codebook_indices(**inputs)
@@ -1602,23 +1600,23 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
         """
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-                Pixel values. Codebook pixel values can be obtained using [`FlavaFeatureExtractor`] by passing
-                `return_codebook_pixels=True`. See [`FlavaFeatureExtractor.__call__`] for details.
+                Pixel values. Codebook pixel values can be obtained using [`FlavaImageProcessor`] by passing
+                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

         Examples:

         ```python
         >>> from PIL import Image
         >>> import requests
-        >>> from transformers import FlavaFeatureExtractor, FlavaImageCodebook
+        >>> from transformers import FlavaImageProcessor, FlavaImageCodebook

         >>> model = FlavaImageCodebook.from_pretrained("{0}")
-        >>> feature_extractor = FlavaFeatureExtractor.from_pretrained("{0}")
+        >>> image_processor = FlavaImageProcessor.from_pretrained("{0}")

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)

-        >>> inputs = feature_extractor([image], return_codebook_pixels=True, return_tensors="pt")
+        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
         >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

         >>> outputs = model(**inputs)
......
@@ -916,7 +916,7 @@ GIT_VISION_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details.
+            [`CLIPImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
......
@@ -41,7 +41,6 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "GLPNConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "GLPNImageProcessor"

 # Base docstring
 _CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti"
@@ -503,7 +502,6 @@ class GLPNModel(GLPNPreTrainedModel):
     @add_start_docstrings_to_model_forward(GLPN_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     @add_code_sample_docstrings(
-        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
......
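
GLPN closes out the diff with the same substitution. A minimal sketch with the `vinvino02/glpn-kitti` checkpoint named above:

```python
import torch
from PIL import Image
import requests
from transformers import GLPNImageProcessor, GLPNModel

image_processor = GLPNImageProcessor.from_pretrained("vinvino02/glpn-kitti")
model = GLPNModel.from_pretrained("vinvino02/glpn-kitti")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)  # BaseModelOutput from the hierarchical encoder
print(outputs.last_hidden_state.shape)
```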