Unverified Commit ae454f41 authored by amyeroberts, committed by GitHub

Update old existing feature extractor references (#24552)

* Update old existing feature extractor references

* Typo

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Address comments from review - update 'feature extractor'
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
parent 10c2ac7b
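For context on the change: the diff below swaps the deprecated `*FeatureExtractor` vision classes for their `*ImageProcessor` equivalents across conversion scripts, processors, ONNX configs and docstrings. A minimal sketch of the user-facing migration pattern, using an illustrative checkpoint name that is not taken from this commit:

from PIL import Image
from transformers import BeitImageProcessor  # was: BeitFeatureExtractor

# Illustrative checkpoint; any BEiT checkpoint with a saved preprocessing config works the same way.
image_processor = BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")

image = Image.open("example.png").convert("RGB")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # (batch_size, num_channels, height, width)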
@@ -8,7 +8,7 @@ from PIL import Image
 from timm.models import create_model

 from transformers import (
-    BeitFeatureExtractor,
+    BeitImageProcessor,
     Data2VecVisionConfig,
     Data2VecVisionForImageClassification,
     Data2VecVisionModel,
@@ -304,9 +304,9 @@ def main():
     orig_model.eval()

     # 3. Forward Beit model
-    feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
+    image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
     image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
-    encoding = feature_extractor(images=image, return_tensors="pt")
+    encoding = image_processor(images=image, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
@@ -354,7 +354,7 @@ def main():
     # 7. Save
     print(f"Saving to {args.hf_checkpoint_name}")
     hf_model.save_pretrained(args.hf_checkpoint_name)
-    feature_extractor.save_pretrained(args.hf_checkpoint_name)
+    image_processor.save_pretrained(args.hf_checkpoint_name)


 if __name__ == "__main__":
...
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DeformableDetrConfig, DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection
+from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor
 from transformers.utils import logging
@@ -115,12 +115,12 @@ def convert_deformable_detr_checkpoint(
     config.id2label = id2label
     config.label2id = {v: k for k, v in id2label.items()}

-    # load feature extractor
-    feature_extractor = DeformableDetrFeatureExtractor(format="coco_detection")
+    # load image processor
+    image_processor = DeformableDetrImageProcessor(format="coco_detection")

     # prepare image
     img = prepare_img()
-    encoding = feature_extractor(images=img, return_tensors="pt")
+    encoding = image_processor(images=img, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     logger.info("Converting model...")
@@ -185,11 +185,11 @@ def convert_deformable_detr_checkpoint(
     print("Everything ok!")

-    # Save model and feature extractor
-    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    # Save model and image processor
+    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_folder_path)
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     # Push to hub
     if push_to_hub:
...
@@ -25,7 +25,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import DeiTConfig, DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
+from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor
 from transformers.utils import logging
@@ -182,12 +182,12 @@ def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
     model = DeiTForImageClassificationWithTeacher(config).eval()
     model.load_state_dict(state_dict)

-    # Check outputs on an image, prepared by DeiTFeatureExtractor
+    # Check outputs on an image, prepared by DeiTImageProcessor
     size = int(
         (256 / 224) * config.image_size
     )  # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103
-    feature_extractor = DeiTFeatureExtractor(size=size, crop_size=config.image_size)
-    encoding = feature_extractor(images=prepare_img(), return_tensors="pt")
+    image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size)
+    encoding = image_processor(images=prepare_img(), return_tensors="pt")
     pixel_values = encoding["pixel_values"]
     outputs = model(pixel_values)
@@ -198,8 +198,8 @@ def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)


 if __name__ == "__main__":
...
@@ -25,7 +25,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation
+from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor
 from transformers.utils import logging
@@ -201,13 +201,13 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     config.id2label = id2label
     config.label2id = {v: k for k, v in id2label.items()}

-    # load feature extractor
+    # load image processor
     format = "coco_panoptic" if is_panoptic else "coco_detection"
-    feature_extractor = DetrFeatureExtractor(format=format)
+    image_processor = DetrImageProcessor(format=format)

     # prepare image
     img = prepare_img()
-    encoding = feature_extractor(images=img, return_tensors="pt")
+    encoding = image_processor(images=img, return_tensors="pt")
     pixel_values = encoding["pixel_values"]

     logger.info(f"Converting model {model_name}...")
@@ -258,11 +258,11 @@ def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
     if is_panoptic:
         assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)

-    # Save model and feature extractor
-    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
+    # Save model and image processor
+    logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     model.save_pretrained(pytorch_dump_folder_path)
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    image_processor.save_pretrained(pytorch_dump_folder_path)


 if __name__ == "__main__":
...
@@ -1341,8 +1341,7 @@ class DetrImageProcessor(BaseImageProcessor):
         Args:
             results (`List[Dict]`):
-                Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks" results will be
-                added.
+                Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
             outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
             orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
...
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image

-from transformers import BeitConfig, BeitFeatureExtractor, BeitForImageClassification, BeitForMaskedImageModeling
+from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor
 from transformers.image_utils import PILImageResampling
 from transformers.utils import logging
@@ -171,12 +171,12 @@ def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     model.load_state_dict(state_dict)

     # Check outputs on an image
-    feature_extractor = BeitFeatureExtractor(
+    image_processor = BeitImageProcessor(
         size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False
     )
     image = prepare_img()
-    encoding = feature_extractor(images=image, return_tensors="pt")
+    encoding = image_processor(images=image, return_tensors="pt")
     pixel_values = encoding["pixel_values"]
     outputs = model(pixel_values)
@@ -189,18 +189,18 @@ def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         if has_lm_head:
             model_name = "dit-base" if "base" in checkpoint_url else "dit-large"
         else:
             model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip"

-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
         model.push_to_hub(
...
@@ -21,7 +21,7 @@ from datasets import load_dataset
 from donut import DonutModel

 from transformers import (
-    DonutFeatureExtractor,
+    DonutImageProcessor,
     DonutProcessor,
     DonutSwinConfig,
     DonutSwinModel,
@@ -152,10 +152,10 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
     image = dataset["test"][0]["image"].convert("RGB")

     tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
-    feature_extractor = DonutFeatureExtractor(
+    image_processor = DonutImageProcessor(
         do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
     )
-    processor = DonutProcessor(feature_extractor, tokenizer)
+    processor = DonutProcessor(image_processor, tokenizer)
     pixel_values = processor(image, return_tensors="pt").pixel_values

     if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
...
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation
+from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
 from transformers.utils import logging
@@ -244,10 +244,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # Check outputs on an image
     size = 480 if "ade" in checkpoint_url else 384
-    feature_extractor = DPTFeatureExtractor(size=size)
+    image_processor = DPTImageProcessor(size=size)

     image = prepare_img()
-    encoding = feature_extractor(image, return_tensors="pt")
+    encoding = image_processor(image, return_tensors="pt")

     # forward pass
     outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
@@ -271,12 +271,12 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         model.push_to_hub("ybelkada/dpt-hybrid-midas")
-        feature_extractor.push_to_hub("ybelkada/dpt-hybrid-midas")
+        image_processor.push_to_hub("ybelkada/dpt-hybrid-midas")


 if __name__ == "__main__":
...
@@ -24,7 +24,7 @@ import torch
 from huggingface_hub import cached_download, hf_hub_url
 from PIL import Image

-from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation
+from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor
 from transformers.utils import logging
@@ -211,10 +211,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     # Check outputs on an image
     size = 480 if "ade" in checkpoint_url else 384
-    feature_extractor = DPTFeatureExtractor(size=size)
+    image_processor = DPTImageProcessor(size=size)

     image = prepare_img()
-    encoding = feature_extractor(image, return_tensors="pt")
+    encoding = image_processor(image, return_tensors="pt")

     # forward pass
     outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth
@@ -233,8 +233,8 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving feature extractor to {pytorch_dump_folder_path}")
-    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    print(f"Saving image processor to {pytorch_dump_folder_path}")
+    image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         print("Pushing model to hub...")
@@ -244,10 +244,10 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
             commit_message="Add model",
             use_temp_dir=True,
         )
-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
...
@@ -208,7 +208,7 @@ def convert_efficientformer_checkpoint(
         )
         processor.push_to_hub(
             repo_id=f"Bearnardd/{pytorch_dump_path}",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
@@ -234,12 +234,12 @@ if __name__ == "__main__":
         "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
     )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
     parser.add_argument(
         "--no-push_to_hub",
         dest="push_to_hub",
         action="store_false",
-        help="Do not push model and feature extractor to the hub",
+        help="Do not push model and image processor to the hub",
     )
     parser.set_defaults(push_to_hub=True)
...
@@ -537,8 +537,8 @@ EFFICIENTFORMER_START_DOCSTRING = r"""
 EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See
-            [`ViTFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
+            [`ViTImageProcessor.preprocess`] for details.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
...
@@ -305,12 +305,12 @@ def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_m
     # Create folder to save model
     if not os.path.isdir(pytorch_dump_folder_path):
         os.mkdir(pytorch_dump_folder_path)
-    # Save converted model and feature extractor
+    # Save converted model and image processor
     hf_model.save_pretrained(pytorch_dump_folder_path)
     preprocessor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
-        # Push model and feature extractor to hub
+        # Push model and image processor to hub
         print(f"Pushing converted {model_name} to the hub...")
         model_name = f"efficientnet-{model_name}"
         preprocessor.push_to_hub(model_name)
@@ -333,7 +333,7 @@ if __name__ == "__main__":
         help="Path to the output PyTorch model directory.",
     )
     parser.add_argument("--save_model", action="store_true", help="Save model to local")
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")

     args = parser.parse_args()
     convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)
@@ -23,7 +23,7 @@ import requests
 import torch
 from PIL import Image

-from transformers import GLPNConfig, GLPNFeatureExtractor, GLPNForDepthEstimation
+from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor
 from transformers.utils import logging
@@ -131,12 +131,12 @@ def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_h
     # load GLPN configuration (Segformer-B4 size)
     config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3])

-    # load feature extractor (only resize + rescale)
-    feature_extractor = GLPNFeatureExtractor()
+    # load image processor (only resize + rescale)
+    image_processor = GLPNImageProcessor()

     # prepare image
     image = prepare_img()
-    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

     logger.info("Converting model...")
@@ -179,17 +179,17 @@ def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_h
     # finally, push to hub if required
     if push_to_hub:
-        logger.info("Pushing model and feature extractor to the hub...")
+        logger.info("Pushing model and image processor to the hub...")

         model.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
             commit_message="Add model",
             use_temp_dir=True,
         )
-        feature_extractor.push_to_hub(
+        image_processor.push_to_hub(
             repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
             organization="nielsr",
-            commit_message="Add feature extractor",
+            commit_message="Add image processor",
             use_temp_dir=True,
         )
...
@@ -458,7 +458,7 @@ class GroupViTOnnxConfig(OnnxConfig):
             processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
         )
         image_input_dict = super().generate_dummy_inputs(
-            processor.feature_extractor, batch_size=batch_size, framework=framework
+            processor.image_processor, batch_size=batch_size, framework=framework
         )
         return {**text_input_dict, **image_input_dict}
...
@@ -81,7 +81,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
     def __init__(
         self,
-        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTFeatureExtractor
+        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor
         clusters: Optional[Union[List[List[int]], np.ndarray]] = None,
         do_resize: bool = True,
         size: Dict[str, int] = None,
...
@@ -260,7 +260,7 @@ class LayoutLMv3OnnxConfig(OnnxConfig):
         """
         # A dummy image is used so OCR should not be applied
-        setattr(processor.feature_extractor, "apply_ocr", False)
+        setattr(processor.image_processor, "apply_ocr", False)

         # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
         batch_size = compute_effective_axis_dimension(
...
@@ -15,6 +15,7 @@
 """
 Processor class for LayoutXLM.
 """
+import warnings
 from typing import List, Optional, Union

 from ...processing_utils import ProcessorMixin
@@ -24,26 +25,45 @@ from ...utils import TensorType
 class LayoutXLMProcessor(ProcessorMixin):
     r"""
-    Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
-    single processor.
+    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
+    processor.

     [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.

-    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
-    to get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
     [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
     `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
     into token-level `labels` for token classification tasks (such as FUNSD, CORD).

     Args:
-        feature_extractor (`LayoutLMv2FeatureExtractor`):
-            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
+        image_processor (`LayoutLMv2ImageProcessor`):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
         tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
     """

-    feature_extractor_class = "LayoutLMv2FeatureExtractor"
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
     tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")

+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
     def __call__(
         self,
         images,
@@ -68,37 +88,37 @@ class LayoutXLMProcessor(ProcessorMixin):
         **kwargs,
     ) -> BatchEncoding:
         """
-        This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
-        [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
         bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
-        together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
         `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
         arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images`.

         Please refer to the docstring of the above two methods for more information.
         """
         # verify input
-        if self.feature_extractor.apply_ocr and (boxes is not None):
+        if self.image_processor.apply_ocr and (boxes is not None):
             raise ValueError(
                 "You cannot provide bounding boxes "
-                "if you initialized the feature extractor with apply_ocr set to True."
+                "if you initialized the image processor with apply_ocr set to True."
             )

-        if self.feature_extractor.apply_ocr and (word_labels is not None):
+        if self.image_processor.apply_ocr and (word_labels is not None):
             raise ValueError(
-                "You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
             )

         if return_overflowing_tokens is True and return_offsets_mapping is False:
             raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")

-        # first, apply the feature extractor
-        features = self.feature_extractor(images=images, return_tensors=return_tensors)
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)

         # second, apply the tokenizer
-        if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
             if isinstance(text, str):
-                text = [text]  # add batch dimension (as the feature extractor always adds a batch dimension)
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
             text_pair = features["words"]

         encoded_inputs = self.tokenizer(
@@ -162,3 +182,19 @@ class LayoutXLMProcessor(ProcessorMixin):
     @property
     def model_input_names(self):
         return ["input_ids", "bbox", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
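A short usage sketch of the backward-compatibility path introduced above (the checkpoint name is illustrative, not part of this commit): the processor is now built from an image processor, the deprecated `feature_extractor` keyword still resolves during the deprecation window but emits a `FutureWarning`, and the old `feature_extractor` attribute is kept as a warning property that forwards to `image_processor`.

from transformers import LayoutLMv2ImageProcessor, LayoutXLMProcessor, LayoutXLMTokenizerFast

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)  # apply_ocr=False avoids the pytesseract dependency
tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")  # illustrative checkpoint

# New, preferred construction
processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer)

# Old keyword still works for now, but raises a FutureWarning
legacy = LayoutXLMProcessor(feature_extractor=image_processor, tokenizer=tokenizer)

# The deprecated attribute aliases the new one (and also warns when accessed)
assert legacy.feature_extractor is legacy.image_processor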
@@ -25,7 +25,7 @@ import timm
 import torch
 from huggingface_hub import hf_hub_download

-from transformers import LevitConfig, LevitFeatureExtractor, LevitForImageClassificationWithTeacher
+from transformers import LevitConfig, LevitForImageClassificationWithTeacher, LevitImageProcessor
 from transformers.utils import logging
@@ -74,8 +74,8 @@ def convert_weight_and_push(
     if push_to_hub:
         our_model.save_pretrained(save_directory / checkpoint_name)
-        feature_extractor = LevitFeatureExtractor()
-        feature_extractor.save_pretrained(save_directory / checkpoint_name)
+        image_processor = LevitImageProcessor()
+        image_processor.save_pretrained(save_directory / checkpoint_name)
         print(f"Pushed {checkpoint_name}")
@@ -167,12 +167,12 @@ if __name__ == "__main__":
         required=False,
         help="Path to the output PyTorch model directory.",
     )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
+    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
     parser.add_argument(
         "--no-push_to_hub",
         dest="push_to_hub",
         action="store_false",
-        help="Do not push model and feature extractor to the hub",
+        help="Do not push model and image processor to the hub",
     )

     args = parser.parse_args()
...
@@ -192,7 +192,7 @@ class OriginalMask2FormerConfigToOursConverter:
         return config


-class OriginalMask2FormerConfigToFeatureExtractorConverter:
+class OriginalMask2FormerConfigToImageProcessorConverter:
     def __call__(self, original_config: object) -> Mask2FormerImageProcessor:
         model = original_config.MODEL
         model_input = original_config.INPUT
@@ -846,7 +846,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
 def test(
     original_model,
     our_model: Mask2FormerForUniversalSegmentation,
-    feature_extractor: Mask2FormerImageProcessor,
+    image_processor: Mask2FormerImageProcessor,
     tolerance: float,
 ):
     with torch.no_grad():
@@ -854,7 +854,7 @@ def test(
         our_model = our_model.eval()

         im = prepare_img()
-        x = feature_extractor(images=im, return_tensors="pt")["pixel_values"]
+        x = image_processor(images=im, return_tensors="pt")["pixel_values"]

         original_model_backbone_features = original_model.backbone(x.clone())
         our_model_output: Mask2FormerModelOutput = our_model.model(x.clone(), output_hidden_states=True)
@@ -979,10 +979,10 @@ if __name__ == "__main__":
         checkpoints_dir, config_dir
     ):
         model_name = get_model_name(checkpoint_file)

-        feature_extractor = OriginalMask2FormerConfigToFeatureExtractorConverter()(
+        image_processor = OriginalMask2FormerConfigToImageProcessorConverter()(
             setup_cfg(Args(config_file=config_file))
         )
-        feature_extractor.size = {"height": 384, "width": 384}
+        image_processor.size = {"height": 384, "width": 384}

         original_config = setup_cfg(Args(config_file=config_file))
         mask2former_kwargs = OriginalMask2Former.from_config(original_config)
@@ -1012,8 +1012,8 @@ if __name__ == "__main__":
         tolerance = 3e-1

         logger.info(f"🪄 Testing {model_name}...")
-        test(original_model, mask2former_for_segmentation, feature_extractor, tolerance)
+        test(original_model, mask2former_for_segmentation, image_processor, tolerance)

         logger.info(f"🪄 Pushing {model_name} to hub...")
-        feature_extractor.push_to_hub(model_name)
+        image_processor.push_to_hub(model_name)
         mask2former_for_segmentation.push_to_hub(model_name)
@@ -2106,8 +2106,8 @@ MASK2FORMER_START_DOCSTRING = r"""
 MASK2FORMER_INPUTS_DOCSTRING = r"""
     Args:
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
-            [`AutoFeatureExtractor.__call__`] for details.
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`AutoImageProcessor.preprocess`] for details.
         pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
             Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
...
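The docstring updates above point users to `AutoImageProcessor` for producing `pixel_values`. A hedged end-to-end sketch of that path (the checkpoint name and image URL are illustrative, not taken from this commit):

import requests
from PIL import Image
from transformers import AutoImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Illustrative Mask2Former checkpoint; AutoImageProcessor resolves it to Mask2FormerImageProcessor
image_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-tiny-coco-instance")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # (batch_size, num_channels, height, width)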