"lightx2v/text2v/vscode:/vscode.git/clone" did not exist on "a843121df39831babf524f62702604645e4b012b"
Unverified Commit 17a7b49b authored by amyeroberts, committed by GitHub

Update doc examples feature extractor -> image processor (#20501)

* Update doc example feature extractor -> image processor

* Apply suggestions from code review
parent afad0c18
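For context, the snippet below sketches the pattern this commit swaps throughout the doc examples: vision preprocessing moves from the deprecated `*FeatureExtractor` classes to the equivalent `*ImageProcessor` classes, with the call signature unchanged. It is illustrative only and not part of the diff; the checkpoint and image URL are taken from the examples below, and a transformers release that ships `AutoImageProcessor` is assumed.

```python
import requests
from PIL import Image

from transformers import AutoFeatureExtractor, AutoImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Old pattern, as shown in the pre-change docstrings below (now deprecated for vision models)
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# New pattern used by the updated examples; same preprocessing, same call signature
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

old_inputs = feature_extractor(images=image, return_tensors="pt")
new_inputs = image_processor(images=image, return_tensors="pt")

# Both return a `pixel_values` tensor of identical shape; only the class name changes
print(old_inputs.pixel_values.shape, new_inputs.pixel_values.shape)
```

The diff below applies exactly this rename to module-level docstring constants, `pixel_values` docstrings, and doctest examples across the vision models.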
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "PoolFormerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "sail/poolformer_s12"
@@ -302,8 +302,8 @@ POOLFORMER_START_DOCSTRING = r"""
POOLFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`PoolFormerFeatureExtractor`]. See
- [`PoolFormerFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`PoolFormerImageProcessor`]. See
+ [`PoolFormerImageProcessor.__call__`] for details.
"""
@@ -37,7 +37,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "RegNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@@ -313,8 +313,8 @@ REGNET_START_DOCSTRING = r"""
REGNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
@@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "RegNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/regnet-y-040"
@@ -389,8 +389,8 @@ REGNET_START_DOCSTRING = r"""
REGNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
@@ -43,7 +43,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ResNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@@ -285,8 +285,8 @@ RESNET_START_DOCSTRING = r"""
RESNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ResNetConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
@@ -313,8 +313,8 @@ RESNET_START_DOCSTRING = r"""
RESNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
@@ -42,7 +42,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SegformerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@@ -491,7 +491,7 @@ SEGFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- [`SegformerFeatureExtractor`]. See [`SegformerFeatureExtractor.__call__`] for details.
+ [`SegformerImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -772,17 +772,17 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
Examples:
```python
- >>> from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
+ >>> from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
>>> from PIL import Image
>>> import requests
- >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+ >>> image_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
>>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)
>>> list(logits.shape)
@@ -37,7 +37,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SegformerConfig"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "SegformerImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
@@ -568,8 +568,8 @@ SEGFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -835,17 +835,17 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
Examples:
```python
- >>> from transformers import SegformerFeatureExtractor, TFSegformerForSemanticSegmentation
+ >>> from transformers import SegformerImageProcessor, TFSegformerForSemanticSegmentation
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+ >>> image_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
>>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
- >>> inputs = feature_extractor(images=image, return_tensors="tf")
+ >>> inputs = image_processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs, training=False)
>>> # logits are of shape (batch_size, num_labels, height/4, width/4)
>>> logits = outputs.logits
@@ -43,7 +43,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@@ -888,8 +888,8 @@ SWIN_START_DOCSTRING = r"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -1053,7 +1053,7 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
Examples:
```python
- >>> from transformers import AutoFeatureExtractor, SwinForMaskedImageModeling
+ >>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
@@ -1061,11 +1061,11 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-base-simmim-window6-192")
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
>>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
- >>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
@@ -47,7 +47,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SwinConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
@@ -985,8 +985,8 @@ SWIN_START_DOCSTRING = r"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -1321,7 +1321,7 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
Examples:
```python
- >>> from transformers import AutoFeatureExtractor, TFSwinForMaskedImageModeling
+ >>> from transformers import AutoImageProcessor, TFSwinForMaskedImageModeling
>>> import tensorflow as tf
>>> from PIL import Image
>>> import requests
@@ -1329,11 +1329,11 @@ class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
>>> model = TFSwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
- >>> pixel_values = feature_extractor(images=image, return_tensors="tf").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = tf.random.uniform((1, num_patches)) >= 0.5
@@ -44,7 +44,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "Swinv2Config"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
@@ -968,8 +968,8 @@ SWINV2_START_DOCSTRING = r"""
SWINV2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -1136,7 +1136,7 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
Examples:
```python
- >>> from transformers import AutoFeatureExtractor, Swinv2ForMaskedImageModeling
+ >>> from transformers import AutoImageProcessor, Swinv2ForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
@@ -1144,11 +1144,11 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
>>> model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
- >>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
@@ -136,7 +136,7 @@ class TableTransformerModelOutput(Seq2SeqModelOutput):
@dataclass
- # Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->TableTransformer,DetrFeatureExtractor->DetrFeatureExtractor
+ # Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->TableTransformer,DetrImageProcessor->DetrImageProcessor
class TableTransformerObjectDetectionOutput(ModelOutput):
"""
Output type of [`TableTransformerForObjectDetection`].
@@ -153,7 +153,7 @@ class TableTransformerObjectDetectionOutput(ModelOutput):
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
- possible padding). You can use [`~TableTransformerFeatureExtractor.post_process_object_detection`] to
+ possible padding). You can use [`~TableTransformerImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
@@ -797,8 +797,7 @@ TABLE_TRANSFORMER_INPUTS_DOCSTRING = r"""
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it.
- Pixel values can be obtained using [`DetrFeatureExtractor`]. See [`DetrFeatureExtractor.__call__`] for
- details.
+ Pixel values can be obtained using [`DetrImageProcessor`]. See [`DetrImageProcessor.__call__`] for details.
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
@@ -1188,18 +1187,18 @@ class TableTransformerModel(TableTransformerPreTrainedModel):
Examples:
```python
- >>> from transformers import AutoFeatureExtractor, TableTransformerModel
+ >>> from transformers import AutoImageProcessor, TableTransformerModel
>>> from huggingface_hub import hf_hub_download
>>> from PIL import Image
>>> file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
>>> image = Image.open(file_path).convert("RGB")
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
>>> model = TableTransformerModel.from_pretrained("microsoft/table-transformer-detection")
>>> # prepare image for the model
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> inputs = image_processor(images=image, return_tensors="pt")
>>> # forward pass
>>> outputs = model(**inputs)
@@ -1357,24 +1356,24 @@ class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
```python
>>> from huggingface_hub import hf_hub_download
- >>> from transformers import AutoFeatureExtractor, TableTransformerForObjectDetection
+ >>> from transformers import AutoImageProcessor, TableTransformerForObjectDetection
>>> import torch
>>> from PIL import Image
>>> file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
>>> image = Image.open(file_path).convert("RGB")
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
+ >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
>>> model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API
>>> target_sizes = torch.tensor([image.size[::-1]])
- >>> results = feature_extractor.post_process_object_detection(
- ... outputs, threshold=0.9, target_sizes=target_sizes
- ... )[0]
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
+ ... 0
+ ... ]
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
... box = [round(i, 2) for i in box.tolist()]
@@ -23,7 +23,7 @@ from ...processing_utils import ProcessorMixin
class TrOCRProcessor(ProcessorMixin):
r"""
- Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
+ Constructs a TrOCR processor which wraps a vision image processor and a TrOCR tokenizer into a single processor.
[`TrOCRProcessor`] offers all the functionalities of [`ViTImageProcessor`/`DeiTImageProcessor`] and
[`RobertaTokenizer`/`XLMRobertaTokenizer`]. See the [`~TrOCRProcessor.__call__`] and [`~TrOCRProcessor.decode`] for
@@ -38,7 +38,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "VanConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
@@ -407,8 +407,8 @@ VAN_START_DOCSTRING = r"""
VAN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
@@ -510,8 +510,8 @@ VIDEOMAE_START_DOCSTRING = r"""
VIDEOMAE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`VideoMAEFeatureExtractor`]. See
- [`VideoMAEFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`VideoMAEImageProcessor`]. See
+ [`VideoMAEImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -581,7 +581,7 @@ class VideoMAEModel(VideoMAEPreTrainedModel):
>>> from decord import VideoReader, cpu
>>> import numpy as np
- >>> from transformers import VideoMAEFeatureExtractor, VideoMAEModel
+ >>> from transformers import VideoMAEImageProcessor, VideoMAEModel
>>> from huggingface_hub import hf_hub_download
@@ -605,11 +605,11 @@ class VideoMAEModel(VideoMAEPreTrainedModel):
>>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
>>> video = videoreader.get_batch(indices).asnumpy()
- >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
+ >>> image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
>>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
>>> # prepare video for the model
- >>> inputs = feature_extractor(list(video), return_tensors="pt")
+ >>> inputs = image_processor(list(video), return_tensors="pt")
>>> # forward pass
>>> outputs = model(**inputs)
@@ -765,17 +765,17 @@ class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
Examples:
```python
- >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForPreTraining
+ >>> from transformers import VideoMAEImageProcessor, VideoMAEForPreTraining
>>> import numpy as np
>>> import torch
>>> num_frames = 16
>>> video = list(np.random.randn(16, 3, 224, 224))
- >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
+ >>> image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
>>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")
- >>> pixel_values = feature_extractor(video, return_tensors="pt").pixel_values
+ >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values
>>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
>>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
@@ -942,7 +942,7 @@ class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
>>> import torch
>>> import numpy as np
- >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
+ >>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
@@ -968,10 +968,10 @@ class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
>>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
>>> video = videoreader.get_batch(indices).asnumpy()
- >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+ >>> image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
>>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
- >>> inputs = feature_extractor(list(video), return_tensors="pt")
+ >>> inputs = image_processor(list(video), return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(**inputs)
@@ -86,8 +86,8 @@ VISION_ENCODER_DECODER_START_DOCSTRING = r"""
VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using the vision model's feature extractor. For example, using
- [`ViTFeatureExtractor`]. See [`ViTFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using
+ [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@@ -114,8 +114,8 @@ VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
VISION_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using the vision model's feature extractor. For example, using
- [`ViTFeatureExtractor`]. See [`ViTFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using
+ [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -409,21 +409,21 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
Example:
```python
- >>> from transformers import ViTFeatureExtractor, FlaxVisionEncoderDecoderModel
+ >>> from transformers import ViTImageProcessor, FlaxVisionEncoderDecoderModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+ >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
... "google/vit-base-patch16-224-in21k", "gpt2"
... )
- >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
>>> encoder_outputs = model.encode(pixel_values)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -487,7 +487,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
Example:
```python
- >>> from transformers import ViTFeatureExtractor, FlaxVisionEncoderDecoderModel
+ >>> from transformers import ViTImageProcessor, FlaxVisionEncoderDecoderModel
>>> import jax.numpy as jnp
>>> from PIL import Image
>>> import requests
@@ -495,14 +495,14 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+ >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
... "google/vit-base-patch16-224-in21k", "gpt2"
... )
- >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
>>> encoder_outputs = model.encode(pixel_values)
>>> decoder_start_token_id = model.config.decoder.bos_token_id
@@ -617,14 +617,14 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
Examples:
```python
- >>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer
+ >>> from transformers import FlaxVisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+ >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # load output tokenizer
>>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
@@ -634,7 +634,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
... "google/vit-base-patch16-224-in21k", "gpt2"
... )
- >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
>>> # use GPT2's eos_token as the pad as well as eos token
>>> model.config.eos_token_id = model.config.decoder.eos_token_id
@@ -106,8 +106,8 @@ VISION_TEXT_DUAL_ENCODER_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
- a feature extractor (e.g. if you use ViT as the encoder, you should use [`ViTFeatureExtractor`]). See
- [`ViTFeatureExtractor.__call__`] for details.
+ an image processor (e.g. if you use ViT as the encoder, you should use [`ViTImageProcessor`]). See
+ [`ViTImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -70,8 +70,8 @@ VIT_START_DOCSTRING = r"""
VIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
- [`ViTFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`]
+ for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
@@ -565,17 +565,17 @@ FLAX_VISION_MODEL_DOCSTRING = """
Examples:
```python
- >>> from transformers import ViTFeatureExtractor, FlaxViTModel
+ >>> from transformers import ViTImageProcessor, FlaxViTModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+ >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
- >>> inputs = feature_extractor(images=image, return_tensors="np")
+ >>> inputs = image_processor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```
@@ -648,7 +648,7 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
Example:
```python
- >>> from transformers import ViTFeatureExtractor, FlaxViTForImageClassification
+ >>> from transformers import ViTImageProcessor, FlaxViTForImageClassification
>>> from PIL import Image
>>> import jax
>>> import requests
@@ -656,10 +656,10 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+ >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
>>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
- >>> inputs = feature_extractor(images=image, return_tensors="np")
+ >>> inputs = image_processor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
@@ -41,7 +41,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -629,8 +629,8 @@ VIT_START_DOCSTRING = r"""
VIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
- [`ViTFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`]
+ for details.
head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -42,7 +42,7 @@ logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ViTConfig"
_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor"
_FEAT_EXTRACTOR_FOR_DOC = "ViTImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
@@ -481,8 +481,8 @@ VIT_START_DOCSTRING = r"""
VIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
- [`ViTFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See [`ViTImageProcessor.__call__`]
+ for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -664,7 +664,7 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel):
Examples:
```python
- >>> from transformers import ViTFeatureExtractor, ViTForMaskedImageModeling
+ >>> from transformers import ViTImageProcessor, ViTForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests
@@ -672,11 +672,11 @@ class ViTForMaskedImageModeling(ViTPreTrainedModel):
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+ >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")
>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
- >>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+ >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
@@ -770,8 +770,8 @@ VIT_MAE_START_DOCSTRING = r"""
VIT_MAE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
- Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
- [`AutoFeatureExtractor.__call__`] for details.
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`AutoImageProcessor.__call__`] for details.
head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
@@ -830,17 +830,17 @@ class TFViTMAEModel(TFViTMAEPreTrainedModel):
Examples:
```python
- >>> from transformers import AutoFeatureExtractor, TFViTMAEModel
+ >>> from transformers import AutoImageProcessor, TFViTMAEModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base")
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
>>> model = TFViTMAEModel.from_pretrained("facebook/vit-mae-base")
- >>> inputs = feature_extractor(images=image, return_tensors="tf")
+ >>> inputs = image_processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```"""
@@ -1121,17 +1121,17 @@ class TFViTMAEForPreTraining(TFViTMAEPreTrainedModel):
Examples:
```python
- >>> from transformers import AutoFeatureExtractor, TFViTMAEForPreTraining
+ >>> from transformers import AutoImageProcessor, TFViTMAEForPreTraining
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base")
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
>>> model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
- >>> inputs = feature_extractor(images=image, return_tensors="pt")
+ >>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
>>> mask = outputs.mask