Unverified commit 17a7b49b authored by amyeroberts, committed by GitHub

Update doc examples feature extractor -> image processor (#20501)

* Update doc example feature extractor -> image processor

* Apply suggestions from code review
parent afad0c18
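Every hunk below applies the same mechanical rename: feature extractor classes and variables become their image processor counterparts, while the call signature stays the same. A minimal sketch of the before/after usage pattern, not part of the diff itself; the ResNet checkpoint and `AutoModelForImageClassification` are only illustrative stand-ins for the models touched here:

```python
>>> from transformers import AutoImageProcessor, AutoModelForImageClassification
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # before this PR the doc examples read:
>>> # feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
>>> model = AutoModelForImageClassification.from_pretrained("microsoft/resnet-50")

>>> # the __call__ signature is unchanged; only the class and variable names differ
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
```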
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "PoolFormerConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "sail/poolformer_s12"
@@ -302,8 +302,8 @@ POOLFORMER_START_DOCSTRING = r"""
 POOLFORMER_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`PoolFormerFeatureExtractor`]. See
-[`PoolFormerFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`PoolFormerImageProcessor`]. See
+[`PoolFormerImageProcessor.__call__`] for details.
 """
...
@@ -37,7 +37,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "RegNetConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@@ -313,8 +313,8 @@ REGNET_START_DOCSTRING = r"""
 REGNET_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 output_hidden_states (`bool`, *optional*):
 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
...
@@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "RegNetConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@@ -389,8 +389,8 @@ REGNET_START_DOCSTRING = r"""
 REGNET_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 output_hidden_states (`bool`, *optional*):
 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
 more detail.
...
@@ -43,7 +43,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ResNetConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@@ -285,8 +285,8 @@ RESNET_START_DOCSTRING = r"""
 RESNET_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 output_hidden_states (`bool`, *optional*):
 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
...
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ResNetConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@@ -313,8 +313,8 @@ RESNET_START_DOCSTRING = r"""
 RESNET_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 output_hidden_states (`bool`, *optional*):
 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
...
@@ -42,7 +42,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "SegformerConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "SegformerFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@@ -491,7 +491,7 @@ SEGFORMER_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
 Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-[`SegformerFeatureExtractor`]. See [`SegformerFeatureExtractor.__call__`] for details.
+[`SegformerImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
 output_attentions (`bool`, *optional*):
 Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -772,17 +772,17 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
 Examples:
 ```python
->>> from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
+>>> from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
 >>> from PIL import Image
 >>> import requests
->>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+>>> image_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
 >>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> inputs = feature_extractor(images=image, return_tensors="pt")
+>>> inputs = image_processor(images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)
 >>> list(logits.shape)
...
@@ -37,7 +37,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "SegformerConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "SegformerFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@@ -568,8 +568,8 @@ SEGFORMER_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 output_attentions (`bool`, *optional*):
 Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -835,17 +835,17 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
 Examples:
 ```python
->>> from transformers import SegformerFeatureExtractor, TFSegformerForSemanticSegmentation
+>>> from transformers import SegformerImageProcessor, TFSegformerForSemanticSegmentation
 >>> from PIL import Image
 >>> import requests
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+>>> image_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
 >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
->>> inputs = feature_extractor(images=image, return_tensors="tf")
+>>> inputs = image_processor(images=image, return_tensors="tf")
 >>> outputs = model(**inputs, training=False)
 >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
 >>> logits = outputs.logits
...
@@ -43,7 +43,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "SwinConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@@ -888,8 +888,8 @@ SWIN_START_DOCSTRING = r"""
 SWIN_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -1053,7 +1053,7 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
 Examples:
 ```python
->>> from transformers import AutoFeatureExtractor, SwinForMaskedImageModeling
+>>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
 >>> import torch
 >>> from PIL import Image
 >>> import requests
@@ -1061,11 +1061,11 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-base-simmim-window6-192")
+>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
 >>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")
 >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
->>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
 >>> # create random boolean mask of shape (batch_size, num_patches)
 >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
...
@@ -47,7 +47,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "SwinConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@@ -985,8 +985,8 @@ SWIN_START_DOCSTRING = r"""
 SWIN_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -1321,7 +1321,7 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
 Examples:
 ```python
->>> from transformers import AutoFeatureExtractor, TFSwinForMaskedImageModeling
+>>> from transformers import AutoImageProcessor, TFSwinForMaskedImageModeling
 >>> import tensorflow as tf
 >>> from PIL import Image
 >>> import requests
@@ -1329,11 +1329,11 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
 >>> model = TFSwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
 >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
->>> pixel_values = feature_extractor(images=image, return_tensors="tf").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
 >>> # create random boolean mask of shape (batch_size, num_patches)
 >>> bool_masked_pos = tf.random.uniform((1, num_patches)) >= 0.5
...
@@ -44,7 +44,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "Swinv2Config"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
@@ -968,8 +968,8 @@ SWINV2_START_DOCSTRING = r"""
 SWINV2_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -1136,7 +1136,7 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
 Examples:
 ```python
->>> from transformers import AutoFeatureExtractor, Swinv2ForMaskedImageModeling
+>>> from transformers import AutoImageProcessor, Swinv2ForMaskedImageModeling
 >>> import torch
 >>> from PIL import Image
 >>> import requests
@@ -1144,11 +1144,11 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
+>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
 >>> model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
 >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
->>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
 >>> # create random boolean mask of shape (batch_size, num_patches)
 >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
...
@@ -136,7 +136,7 @@ class TableTransformerModelOutput(Seq2SeqModelOutput):
 @dataclass
-# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->TableTransformer,DetrFeatureExtractor->DetrFeatureExtractor
+# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->TableTransformer,DetrImageProcessor->DetrImageProcessor
 class TableTransformerObjectDetectionOutput(ModelOutput):
 """
 Output type of [`TableTransformerForObjectDetection`].
@@ -153,7 +153,7 @@ class TableTransformerObjectDetectionOutput(ModelOutput):
 pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
 Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
 values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-possible padding). You can use [`~TableTransformerFeatureExtractor.post_process_object_detection`] to
+possible padding). You can use [`~TableTransformerImageProcessor.post_process_object_detection`] to
 retrieve the unnormalized bounding boxes.
 auxiliary_outputs (`list[Dict]`, *optional*):
 Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
@@ -797,8 +797,7 @@ TABLE_TRANSFORMER_INPUTS_DOCSTRING = r"""
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
 Pixel values. Padding will be ignored by default should you provide it.
-Pixel values can be obtained using [`DetrFeatureExtractor`]. See [`DetrFeatureExtractor.__call__`] for
-details.
+Pixel values can be obtained using [`DetrImageProcessor`]. See [`DetrImageProcessor.__call__`] for details.
 pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
 Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -1188,18 +1187,18 @@ class TableTransformerModel(TableTransformerPreTrainedModel):
 Examples:
 ```python
->>> from transformers import AutoFeatureExtractor, TableTransformerModel
+>>> from transformers import AutoImageProcessor, TableTransformerModel
 >>> from huggingface_hub import hf_hub_download
 >>> from PIL import Image
 >>> file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
 >>> image = Image.open(file_path).convert("RGB")
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
+>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
 >>> model = TableTransformerModel.from_pretrained("microsoft/table-transformer-detection")
 >>> # prepare image for the model
->>> inputs = feature_extractor(images=image, return_tensors="pt")
+>>> inputs = image_processor(images=image, return_tensors="pt")
 >>> # forward pass
 >>> outputs = model(**inputs)
@@ -1357,24 +1356,24 @@ class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
 ```python
 >>> from huggingface_hub import hf_hub_download
->>> from transformers import AutoFeatureExtractor, TableTransformerForObjectDetection
+>>> from transformers import AutoImageProcessor, TableTransformerForObjectDetection
 >>> import torch
 >>> from PIL import Image
 >>> file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
 >>> image = Image.open(file_path).convert("RGB")
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
+>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
 >>> model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
->>> inputs = feature_extractor(images=image, return_tensors="pt")
+>>> inputs = image_processor(images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> # convert outputs (bounding boxes and class logits) to COCO API
 >>> target_sizes = torch.tensor([image.size[::-1]])
->>> results = feature_extractor.post_process_object_detection(
-... outputs, threshold=0.9, target_sizes=target_sizes
-... )[0]
+>>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
+... 0
+... ]
 >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
 ... box = [round(i, 2) for i in box.tolist()]
...
@@ -23,7 +23,7 @@ from ...processing_utils import ProcessorMixin
 class TrOCRProcessor(ProcessorMixin):
 r"""
-Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
+Constructs a TrOCR processor which wraps a vision image processor and a TrOCR tokenizer into a single processor.
 [`TrOCRProcessor`] offers all the functionalities of [`ViTImageProcessor`/`DeiTImageProcessor`] and
 [`RobertaTokenizer`/`XLMRobertaTokenizer`]. See the [`~TrOCRProcessor.__call__`] and [`~TrOCRProcessor.decode`] for
...
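For reference, the processor exposes both pieces behind one object; a minimal usage sketch, assuming the public `microsoft/trocr-base-handwritten` checkpoint and the standard `VisionEncoderDecoderModel` generate/decode flow, neither of which appears in this diff:

```python
>>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

>>> # the processor bundles the image processor (pixel values in) and the tokenizer (text out)
>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")  # assumed checkpoint
>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
>>> generated_ids = model.generate(pixel_values)
>>> text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```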
@@ -38,7 +38,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "VanConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
@@ -407,8 +407,8 @@ VAN_START_DOCSTRING = r"""
 VAN_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 output_hidden_states (`bool`, *optional*):
 Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
...
@@ -510,8 +510,8 @@ VIDEOMAE_START_DOCSTRING = r"""
 VIDEOMAE_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`VideoMAEFeatureExtractor`]. See
-[`VideoMAEFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`VideoMAEImageProcessor`]. See
+[`VideoMAEImageProcessor.__call__`] for details.
 head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -581,7 +581,7 @@ class VideoMAEModel(VideoMAEPreTrainedModel):
 >>> from decord import VideoReader, cpu
 >>> import numpy as np
->>> from transformers import VideoMAEFeatureExtractor, VideoMAEModel
+>>> from transformers import VideoMAEImageProcessor, VideoMAEModel
 >>> from huggingface_hub import hf_hub_download
@@ -605,11 +605,11 @@ class VideoMAEModel(VideoMAEPreTrainedModel):
 >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
 >>> video = videoreader.get_batch(indices).asnumpy()
->>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
+>>> image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
 >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
 >>> # prepare video for the model
->>> inputs = feature_extractor(list(video), return_tensors="pt")
+>>> inputs = image_processor(list(video), return_tensors="pt")
 >>> # forward pass
 >>> outputs = model(**inputs)
@@ -765,17 +765,17 @@ class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
 Examples:
 ```python
->>> from transformers import VideoMAEFeatureExtractor, VideoMAEForPreTraining
+>>> from transformers import VideoMAEImageProcessor, VideoMAEForPreTraining
 >>> import numpy as np
 >>> import torch
 >>> num_frames = 16
 >>> video = list(np.random.randn(16, 3, 224, 224))
->>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
+>>> image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
 >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")
->>> pixel_values = feature_extractor(video, return_tensors="pt").pixel_values
+>>> pixel_values = image_processor(video, return_tensors="pt").pixel_values
 >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
 >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
@@ -942,7 +942,7 @@ class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
 >>> import torch
 >>> import numpy as np
->>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
+>>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
 >>> from huggingface_hub import hf_hub_download
 >>> np.random.seed(0)
@@ -968,10 +968,10 @@ class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
 >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
 >>> video = videoreader.get_batch(indices).asnumpy()
->>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+>>> image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
 >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
->>> inputs = feature_extractor(list(video), return_tensors="pt")
+>>> inputs = image_processor(list(video), return_tensors="pt")
 >>> with torch.no_grad():
 ... outputs = model(**inputs)
...
@@ -86,8 +86,8 @@ VISION_ENCODER_DECODER_START_DOCSTRING = r"""
 VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using the vision model's feature extractor. For example, using
-[`ViTFeatureExtractor`]. See [`ViTFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using
+[`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
 decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
 Indices of decoder input sequence tokens in the vocabulary.
@@ -114,8 +114,8 @@ VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
 VISION_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using the vision model's feature extractor. For example, using
-[`ViTFeatureExtractor`]. See [`ViTFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using
+[`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
 output_attentions (`bool`, *optional*):
 Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
 tensors for more detail.
@@ -409,21 +409,21 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 Example:
 ```python
->>> from transformers import ViTFeatureExtractor, FlaxVisionEncoderDecoderModel
+>>> from transformers import ViTImageProcessor, FlaxVisionEncoderDecoderModel
 >>> from PIL import Image
 >>> import requests
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
 >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
 ... "google/vit-base-patch16-224-in21k", "gpt2"
 ... )
->>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
 >>> encoder_outputs = model.encode(pixel_values)
 ```"""
 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -487,7 +487,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 Example:
 ```python
->>> from transformers import ViTFeatureExtractor, FlaxVisionEncoderDecoderModel
+>>> from transformers import ViTImageProcessor, FlaxVisionEncoderDecoderModel
 >>> import jax.numpy as jnp
 >>> from PIL import Image
 >>> import requests
@@ -495,14 +495,14 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
 >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
 ... "google/vit-base-patch16-224-in21k", "gpt2"
 ... )
->>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
 >>> encoder_outputs = model.encode(pixel_values)
 >>> decoder_start_token_id = model.config.decoder.bos_token_id
@@ -617,14 +617,14 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 Examples:
 ```python
->>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer
+>>> from transformers import FlaxVisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer
 >>> from PIL import Image
 >>> import requests
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> # load output tokenizer
 >>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
@@ -634,7 +634,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 ... "google/vit-base-patch16-224-in21k", "gpt2"
 ... )
->>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
 >>> # use GPT2's eos_token as the pad as well as eos token
 >>> model.config.eos_token_id = model.config.decoder.eos_token_id
...
@@ -106,8 +106,8 @@ VISION_TEXT_DUAL_ENCODER_INPUTS_DOCSTRING = r"""
 [What are position IDs?](../glossary#position-ids)
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
 Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-a feature extractor (e.g. if you use ViT as the encoder, you should use [`ViTFeatureExtractor`]). See
-[`ViTFeatureExtractor.__call__`] for details.
+an image processor (e.g. if you use ViT as the encoder, you should use [`ViTImageProcessor`]). See
+[`ViTImageProcessor.__call__`] for details.
 output_attentions (`bool`, *optional*):
 Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
 tensors for more detail.
...
@@ -70,8 +70,8 @@ VIT_START_DOCSTRING = r"""
 VIT_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
-[`ViTFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`]
+for details.
 output_attentions (`bool`, *optional*):
 Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -565,17 +565,17 @@ FLAX_VISION_MODEL_DOCSTRING = """
 Examples:
 ```python
->>> from transformers import ViTFeatureExtractor, FlaxViTModel
+>>> from transformers import ViTImageProcessor, FlaxViTModel
 >>> from PIL import Image
 >>> import requests
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
->>> inputs = feature_extractor(images=image, return_tensors="np")
+>>> inputs = image_processor(images=image, return_tensors="np")
 >>> outputs = model(**inputs)
 >>> last_hidden_states = outputs.last_hidden_state
 ```
@@ -648,7 +648,7 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
 Example:
 ```python
->>> from transformers import ViTFeatureExtractor, FlaxViTForImageClassification
+>>> from transformers import ViTImageProcessor, FlaxViTForImageClassification
 >>> from PIL import Image
 >>> import jax
 >>> import requests
@@ -656,10 +656,10 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
 >>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
->>> inputs = feature_extractor(images=image, return_tensors="np")
+>>> inputs = image_processor(images=image, return_tensors="np")
 >>> outputs = model(**inputs)
 >>> logits = outputs.logits
...
@@ -41,7 +41,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -629,8 +629,8 @@ VIT_START_DOCSTRING = r"""
 VIT_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
-[`ViTFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`]
+for details.
 head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
...
@@ -42,7 +42,7 @@ logger = logging.get_logger(__name__)
 # General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor"
+_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
 # Base docstring
 _CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -481,8 +481,8 @@ VIT_START_DOCSTRING = r"""
 VIT_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
-[`ViTFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`]
+for details.
 head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -664,7 +664,7 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel):
 Examples:
 ```python
->>> from transformers import ViTFeatureExtractor, ViTForMaskedImageModeling
+>>> from transformers import ViTImageProcessor, ViTForMaskedImageModeling
 >>> import torch
 >>> from PIL import Image
 >>> import requests
@@ -672,11 +672,11 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
->>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
 >>> # create random boolean mask of shape (batch_size, num_patches)
 >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
...
@@ -770,8 +770,8 @@ VIT_MAE_START_DOCSTRING = r"""
 VIT_MAE_INPUTS_DOCSTRING = r"""
 Args:
 pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
-Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-[`AutoFeatureExtractor.__call__`] for details.
+Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+[`AutoImageProcessor.__call__`] for details.
 head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -830,17 +830,17 @@ class TFViTMAEModel(TFViTMAEPreTrainedModel):
 Examples:
 ```python
->>> from transformers import AutoFeatureExtractor, TFViTMAEModel
+>>> from transformers import AutoImageProcessor, TFViTMAEModel
 >>> from PIL import Image
 >>> import requests
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base")
+>>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
 >>> model = TFViTMAEModel.from_pretrained("facebook/vit-mae-base")
->>> inputs = feature_extractor(images=image, return_tensors="tf")
+>>> inputs = image_processor(images=image, return_tensors="tf")
 >>> outputs = model(**inputs)
 >>> last_hidden_states = outputs.last_hidden_state
 ```"""
@@ -1121,17 +1121,17 @@ class TFViTMAEForPreTraining(TFViTMAEPreTrainedModel):
 Examples:
 ```python
->>> from transformers import AutoFeatureExtractor, TFViTMAEForPreTraining
+>>> from transformers import AutoImageProcessor, TFViTMAEForPreTraining
 >>> from PIL import Image
 >>> import requests
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base")
+>>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
 >>> model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
->>> inputs = feature_extractor(images=image, return_tensors="pt")
+>>> inputs = image_processor(images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> loss = outputs.loss
 >>> mask = outputs.mask
...