Unverified Commit ae454f41 authored by amyeroberts, committed by GitHub

Update old existing feature extractor references (#24552)

* Update old existing feature extractor references

* Typo

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Address comments from review - update 'feature extractor'
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
parent 10c2ac7b
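The pattern applied throughout the diff below is mechanical: every vision `*FeatureExtractor` reference is replaced by its `*ImageProcessor` equivalent, which takes the same constructor arguments and returns the same `pixel_values`. As a minimal sketch of the new-style usage (the checkpoint name is only an illustrative example, assuming a transformers version in which the image processor classes exist):

import requests
from PIL import Image

from transformers import ViTImageProcessor  # replaces the deprecated ViTFeatureExtractor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Same call signature and output keys as the old feature extractor
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])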
@@ -8,7 +8,7 @@ from PIL import Image
 from timm.models import create_model

 from transformers import (
-    BeitFeatureExtractor,
+    BeitImageProcessor,
     Data2VecVisionConfig,
     Data2VecVisionForImageClassification,
     Data2VecVisionModel,
@@ -304,9 +304,9 @@ def main():
     orig_model.eval()

     # 3. Forward Beit model
-    feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
+    image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
     image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
-    encoding = feature_extractor(images=image, return_tensors="pt")
+    encoding = image_processor(images=image, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
@@ -354,7 +354,7 @@ def main():
     # 7. Save
     print(f"Saving to {args.hf_checkpoint_name}")
     hf_model.save_pretrained(args.hf_checkpoint_name)
-    feature_extractor.save_pretrained(args.hf_checkpoint_name)
+    image_processor.save_pretrained(args.hf_checkpoint_name)


 if __name__ == "__main__":
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DeformableDetrConfig, DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection
+from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor
 from transformers.utils import logging

@@ -115,12 +115,12 @@ def convert_deformable_detr_checkpoint(
     config.id2label = id2label
     config.label2id = {v: k for k, v in id2label.items()}

-    # load feature extractor
-    feature_extractor = DeformableDetrFeatureExtractor(format="coco_detection")
+    # load image processor
+    image_processor = DeformableDetrImageProcessor(format="coco_detection")

     # prepare image
     img = prepare_img()
-    encoding = feature_extractor(images=img, return_tensors="pt")
+    encoding = image_processor(images=img, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     logger.info("Converting model...")
@@ -185,11 +185,11 @@ def convert_deformable_detr_checkpoint(
     print("Everything ok!")

-    # Save model and feature extractor
-    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    # Save model and image processor
+    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_folder_path)
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     # Push to hub
     if push_to_hub:
@@ -25,7 +25,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import DeiTConfig, DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
+from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor
 from transformers.utils import logging

@@ -182,12 +182,12 @@ def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
     model = DeiTForImageClassificationWithTeacher(config).eval()
     model.load_state_dict(state_dict)

-    # Check outputs on an image, prepared by DeiTFeatureExtractor
+    # Check outputs on an image, prepared by DeiTImageProcessor
     size = int(
         (256 / 224) * config.image_size
     )  # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103
-    feature_extractor = DeiTFeatureExtractor(size=size, crop_size=config.image_size)
-    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
+    image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size)
+    encoding = image_processor(images=prepare_img(), return_tensors="pt")
     pixel_values = encoding["pixel_values"]
     outputs = model(pixel_values)
@@ -198,8 +198,8 @@ def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)


 if __name__ == "__main__":
@@ -25,7 +25,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation
+from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor
 from transformers.utils import logging

@@ -201,13 +201,13 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     config.id2label = id2label
     config.label2id = {v: k for k, v in id2label.items()}

-    # load feature extractor
+    # load image processor
     format = "coco_panoptic" if is_panoptic else "coco_detection"
-    feature_extractor = DetrFeatureExtractor(format=format)
+    image_processor = DetrImageProcessor(format=format)

     # prepare image
     img = prepare_img()
-    encoding = feature_extractor(images=img, return_tensors="pt")
+    encoding = image_processor(images=img, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     logger.info(f"Converting model {model_name}...")
@@ -258,11 +258,11 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     if is_panoptic:
         assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)

-    # Save model and feature extractor
-    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    # Save model and image processor
+    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_folder_path)
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    image_processor.save_pretrained(pytorch_dump_folder_path)


 if __name__ == "__main__":
@@ -1341,8 +1341,7 @@ class DetrImageProcessor(BaseImageProcessor):
         Args:
             results (`List[Dict]`):
-                Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks" results will be
-                added.
+                Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
             outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
             orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
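The corrected docstring reference above points at [`~DetrImageProcessor.post_process`]. As a hedged usage sketch of the DETR post-processing path (assuming the public facebook/detr-resnet-50 checkpoint; `post_process_object_detection` is the non-deprecated counterpart of `post_process`):

import requests
import torch
from PIL import Image

from transformers import DetrForObjectDetection, DetrImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = image_processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# target_sizes holds the original (height, width) so boxes are rescaled back to image space
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
print(results["scores"], results["labels"], results["boxes"])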
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import BeitConfig, BeitFeatureExtractor, BeitForImageClassification, BeitForMaskedImageModeling
+from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor
 from transformers.image_utils import PILImageResampling
 from transformers.utils import logging

@@ -171,12 +171,12 @@ def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     model.load_state_dict(state_dict)

     # Check outputs on an image
-    feature_extractor = BeitFeatureExtractor(
+    image_processor = BeitImageProcessor(
         size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False
     )
     image = prepare_img()
-    encoding = feature_extractor(images=image, return_tensors="pt")
+    encoding = image_processor(images=image, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     outputs = model(pixel_values)
@@ -189,18 +189,18 @@ def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         if has_lm_head:
             model_name = "dit-base" if "base" in checkpoint_url else "dit-large"
         else:
             model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip"

-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
         model.push_to_hub(
@@ -21,7 +21,7 @@ from datasets import load_dataset
 from donut import DonutModel

 from transformers import (
-    DonutFeatureExtractor,
+    DonutImageProcessor,
     DonutProcessor,
     DonutSwinConfig,
     DonutSwinModel,
@@ -152,10 +152,10 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
     image = dataset["test"][0]["image"].convert("RGB")

     tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
-    feature_extractor = DonutFeatureExtractor(
+    image_processor = DonutImageProcessor(
         do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
     )
-    processor = DonutProcessor(feature_extractor, tokenizer)
+    processor = DonutProcessor(image_processor, tokenizer)
     pixel_values = processor(image, return_tensors="pt").pixel_values

     if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation
+from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
 from transformers.utils import logging

@@ -244,10 +244,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # Check outputs on an image
     size = 480 if "ade" in checkpoint_url else 384
-    feature_extractor = DPTFeatureExtractor(size=size)
+    image_processor = DPTImageProcessor(size=size)
     image = prepare_img()
-    encoding = feature_extractor(image, return_tensors="pt")
+    encoding = image_processor(image, return_tensors="pt")

     # forward pass
     outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
@@ -271,12 +271,12 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         model.push_to_hub("ybelkada/dpt-hybrid-midas")
-        feature_extractor.push_to_hub("ybelkada/dpt-hybrid-midas")
+        image_processor.push_to_hub("ybelkada/dpt-hybrid-midas")


 if __name__ == "__main__":
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation
+from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
 from transformers.utils import logging

@@ -211,10 +211,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # Check outputs on an image
     size = 480 if "ade" in checkpoint_url else 384
-    feature_extractor = DPTFeatureExtractor(size=size)
+    image_processor = DPTImageProcessor(size=size)
     image = prepare_img()
-    encoding = feature_extractor(image, return_tensors="pt")
+    encoding = image_processor(image, return_tensors="pt")

     # forward pass
     outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
@@ -233,8 +233,8 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         print("Pushing model to hub...")
@@ -244,10 +244,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
             commit_message="Add model",
             use_temp_dir=True,
         )
-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -208,7 +208,7 @@ def convert_efficientformer_checkpoint(
         )
         processor.push_to_hub(
             repo_id=f"Bearnardd/{pytorch_dump_path}",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -234,12 +234,12 @@ if __name__ == "__main__":
         "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
     )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
     parser.add_argument(
         "--no-push_to_hub",
         dest="push_to_hub",
         action="store_false",
-        help="Do not push model and feature extractor to the hub",
+        help="Do not push model and image processor to the hub",
     )
     parser.set_defaults(push_to_hub=True)
@@ -537,8 +537,8 @@ EFFICIENTFORMER_START_DOCSTRING = r"""
 EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
-            [`ViTFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
+            [`ViTImageProcessor.preprocess`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -305,12 +305,12 @@ def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_m
         # Create folder to save model
         if not os.path.isdir(pytorch_dump_folder_path):
             os.mkdir(pytorch_dump_folder_path)
-        # Save converted model and feature extractor
+        # Save converted model and image processor
         hf_model.save_pretrained(pytorch_dump_folder_path)
         preprocessor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
-        # Push model and feature extractor to hub
+        # Push model and image processor to hub
         print(f"Pushing converted {model_name} to the hub...")
         model_name = f"efficientnet-{model_name}"
         preprocessor.push_to_hub(model_name)
@@ -333,7 +333,7 @@ if __name__ == "__main__":
         help="Path to the output PyTorch model directory.",
     )
     parser.add_argument("--save_model", action="store_true", help="Save model to local")
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")

     args = parser.parse_args()
     convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)
@@ -23,7 +23,7 @@ import requests
 import torch
 from PIL import Image

-from transformers import GLPNConfig, GLPNFeatureExtractor, GLPNForDepthEstimation
+from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor
 from transformers.utils import logging

@@ -131,12 +131,12 @@ def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_h
     # load GLPN configuration (Segformer-B4 size)
     config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3])

-    # load feature extractor (only resize + rescale)
-    feature_extractor = GLPNFeatureExtractor()
+    # load image processor (only resize + rescale)
+    image_processor = GLPNImageProcessor()

     # prepare image
     image = prepare_img()
-    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

     logger.info("Converting model...")
@@ -179,17 +179,17 @@ def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_h
     # finally, push to hub if required
     if push_to_hub:
-        logger.info("Pushing model and feature extractor to the hub...")
+        logger.info("Pushing model and image processor to the hub...")

         model.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
             commit_message="Add model",
             use_temp_dir=True,
         )
-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -458,7 +458,7 @@ class GroupViTOnnxConfig(OnnxConfig):
             processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
         )
         image_input_dict = super().generate_dummy_inputs(
-            processor.feature_extractor, batch_size=batch_size, framework=framework
+            processor.image_processor, batch_size=batch_size, framework=framework
         )
         return {**text_input_dict, **image_input_dict}
@@ -81,7 +81,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
     def __init__(
         self,
-        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTFeatureExtractor
+        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor
         clusters: Optional[Union[List[List[int]], np.ndarray]] = None,
         do_resize: bool = True,
         size: Dict[str, int] = None,
@@ -260,7 +260,7 @@ class LayoutLMv3OnnxConfig(OnnxConfig):
         """

         # A dummy image is used so OCR should not be applied
-        setattr(processor.feature_extractor, "apply_ocr", False)
+        setattr(processor.image_processor, "apply_ocr", False)

         # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
         batch_size = compute_effective_axis_dimension(
@@ -15,6 +15,7 @@
 """
 Processor class for LayoutXLM.
 """
+import warnings
 from typing import List, Optional, Union

 from ...processing_utils import ProcessorMixin
@@ -24,26 +25,45 @@ from ...utils import TensorType

 class LayoutXLMProcessor(ProcessorMixin):
     r"""
-    Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
-    single processor.
+    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
+    processor.

     [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.

-    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
-    to get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
     [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
     `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
     into token-level `labels` for token classification tasks (such as FUNSD, CORD).

     Args:
-        feature_extractor (`LayoutLMv2FeatureExtractor`):
-            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
+        image_processor (`LayoutLMv2ImageProcessor`):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
         tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
             An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
     """

-    feature_extractor_class = "LayoutLMv2FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
     tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")

+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
     def __call__(
         self,
         images,
def __call__(
self,
images,
@@ -68,37 +88,37 @@ class LayoutXLMProcessor(ProcessorMixin):
         **kwargs,
     ) -> BatchEncoding:
         """
-        This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
-        [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
         bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
-        together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
         `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
         arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images`.

         Please refer to the docstring of the above two methods for more information.
         """
         # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
             raise ValueError(
                 "You cannot provide bounding boxes "
-                "if you initialized the feature extractor with apply_ocr set to True."
+                "if you initialized the image processor with apply_ocr set to True."
             )

-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
             raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
             )

         if return_overflowing_tokens is True and return_offsets_mapping is False:
             raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")

-        # first, apply the feature extractor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)

         # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
             if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
             text_pair = features["words"]

         encoded_inputs = self.tokenizer(
@@ -162,3 +182,19 @@ class LayoutXLMProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         return ["input_ids", "bbox", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
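A small sketch of the backward-compatibility path added above (the tokenizer checkpoint name is only an example): constructing the processor with the old `feature_extractor` keyword still works, but now routes through the deprecation shim and emits a `FutureWarning`:

import warnings

from transformers import LayoutLMv2ImageProcessor, LayoutXLMProcessor, LayoutXLMTokenizerFast

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old keyword still accepted, but routed through the deprecation shim
    processor = LayoutXLMProcessor(feature_extractor=image_processor, tokenizer=tokenizer)

assert any(issubclass(w.category, FutureWarning) for w in caught)
assert processor.image_processor is image_processor  # stored under the new attribute name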
@@ -25,7 +25,7 @@ import timm
 import torch
 from huggingface_hub import hf_hub_download

-from transformers import LevitConfig, LevitFeatureExtractor, LevitForImageClassificationWithTeacher
+from transformers import LevitConfig, LevitForImageClassificationWithTeacher, LevitImageProcessor
 from transformers.utils import logging

@@ -74,8 +74,8 @@ def convert_weight_and_push(
     if push_to_hub:
         our_model.save_pretrained(save_directory / checkpoint_name)
-        feature_extractor = LevitFeatureExtractor()
-        feature_extractor.save_pretrained(save_directory / checkpoint_name)
+        image_processor = LevitImageProcessor()
+        image_processor.save_pretrained(save_directory / checkpoint_name)
         print(f"Pushed {checkpoint_name}")
@@ -167,12 +167,12 @@ if __name__ == "__main__":
         required=False,
         help="Path to the output PyTorch model directory.",
     )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
     parser.add_argument(
         "--no-push_to_hub",
         dest="push_to_hub",
         action="store_false",
-        help="Do not push model and feature extractor to the hub",
+        help="Do not push model and image processor to the hub",
    )

     args = parser.parse_args()
@@ -192,7 +192,7 @@ class OriginalMask2FormerConfigToOursConverter:
         return config


-class OriginalMask2FormerConfigToFeatureExtractorConverter:
+class OriginalMask2FormerConfigToImageProcessorConverter:
     def __call__(self, original_config: object) -> Mask2FormerImageProcessor:
         model = original_config.MODEL
         model_input = original_config.INPUT
@@ -846,7 +846,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
 def test(
     original_model,
     our_model: Mask2FormerForUniversalSegmentation,
-    feature_extractor: Mask2FormerImageProcessor,
+    image_processor: Mask2FormerImageProcessor,
     tolerance: float,
 ):
     with torch.no_grad():
@@ -854,7 +854,7 @@ def test(
         our_model = our_model.eval()

         im = prepare_img()
-        x = feature_extractor(images=im, return_tensors="pt")["pixel_values"]
+        x = image_processor(images=im, return_tensors="pt")["pixel_values"]

         original_model_backbone_features = original_model.backbone(x.clone())
         our_model_output: Mask2FormerModelOutput = our_model.model(x.clone(), output_hidden_states=True)
@@ -979,10 +979,10 @@ if __name__ == "__main__":
         checkpoints_dir, config_dir
     ):
         model_name = get_model_name(checkpoint_file)
-        feature_extractor = OriginalMask2FormerConfigToFeatureExtractorConverter()(
+        image_processor = OriginalMask2FormerConfigToImageProcessorConverter()(
             setup_cfg(Args(config_file=config_file))
         )
-        feature_extractor.size = {"height": 384, "width": 384}
+        image_processor.size = {"height": 384, "width": 384}

         original_config = setup_cfg(Args(config_file=config_file))
         mask2former_kwargs = OriginalMask2Former.from_config(original_config)
@@ -1012,8 +1012,8 @@ if __name__ == "__main__":
             tolerance = 3e-1

         logger.info(f"🪄 Testing {model_name}...")
-        test(original_model, mask2former_for_segmentation, feature_extractor, tolerance)
+        test(original_model, mask2former_for_segmentation, image_processor, tolerance)

         logger.info(f"🪄 Pushing {model_name} to hub...")
-        feature_extractor.push_to_hub(model_name)
+        image_processor.push_to_hub(model_name)
         mask2former_for_segmentation.push_to_hub(model_name)
@@ -2106,8 +2106,8 @@ MASK2FORMER_START_DOCSTRING = r"""
 MASK2FORMER_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.preprocess`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: