Unverified commit 15585b81 authored by Pavel Iakubovskii, committed by GitHub

Update object detection with latest resize and pad strategies (#30955)

* Update with new resizing and pad strategy

* Return pixel mask param

* Update inference in guide

* Fix empty compose

* Update guide
parent a25f7d3c
@@ -206,10 +206,10 @@ Instantiate the image processor from the same checkpoint as the model you want t
>>> image_processor = AutoImageProcessor.from_pretrained(
... MODEL_NAME,
-... # At this moment we recommend using external transform to pad and resize images.
-... # It`s faster and yields better results for object-detection models.
-... do_pad=False,
-... do_resize=False,
+... do_resize=True,
+... size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
+... do_pad=True,
+... pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
... )
```
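With this configuration the image processor itself resizes the longest side to `MAX_SIZE` and then pads to a fixed `MAX_SIZE x MAX_SIZE` square, so every processed image in a batch has the same shape. A minimal sketch of that behavior, assuming `MAX_SIZE = 480`, the `facebook/detr-resnet-50` checkpoint as a stand-in for `MODEL_NAME`, and a recent `transformers` release that supports the `pad_size` argument:

```py
>>> import numpy as np
>>> from transformers import AutoImageProcessor

>>> MAX_SIZE = 480  # assumed value, use the same IMAGE_SIZE as in your setup
>>> image_processor = AutoImageProcessor.from_pretrained(
...     "facebook/detr-resnet-50",  # assumed checkpoint, for illustration only
...     do_resize=True,
...     size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
...     do_pad=True,
...     pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
... )

>>> # A dummy non-square image: the longest edge is resized to 480, the rest is padded
>>> dummy = np.zeros((300, 600, 3), dtype=np.uint8)
>>> image_processor(images=[dummy], return_tensors="pt")["pixel_values"].shape  # expected: torch.Size([1, 3, 480, 480])
```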
@@ -217,38 +217,28 @@ Before passing the images to the `image_processor`, apply two preprocessing tran
- Augmenting images
- Reformatting annotations to meet DETR expectations
-First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ...
+First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/).
This library ensures that transformations affect the image and update the bounding boxes accordingly.
The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection),
-and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480),
-flip it horizontally, and brighten it. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
+and it uses the exact same dataset as an example. Apply some geometric and color transformations to the image. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
```py
>>> import albumentations as A
->>> max_size = IMAGE_SIZE
->>> # Resize image longest edge to 480 and then pad image to square 480x480.
->>> # This padding and resizing strategy give better results, see
->>> # https://github.com/huggingface/transformers/pull/30422#discussion_r1584647408
->>> basic_transforms = [
-... A.LongestMaxSize(max_size=max_size),
-... A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-... ]
>>> train_augment_and_transform = A.Compose(
... [
... A.Perspective(p=0.1),
... A.HorizontalFlip(p=0.5),
... A.RandomBrightnessContrast(p=0.5),
... A.HueSaturationValue(p=0.1),
-... *basic_transforms,
... ],
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
... )
>>> validation_transform = A.Compose(
-... basic_transforms,
+... [A.NoOp()],
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
... )
```
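To sanity-check what these pipelines produce, here is a hedged sketch that runs `train_augment_and_transform` on a single made-up example; the dummy image, box, and category below are purely illustrative and are not part of the guide:

```py
>>> import numpy as np

>>> # One dummy 640x480 image with a single COCO-format box [x_min, y_min, width, height]
>>> sample_image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
>>> sample_boxes = [[50.0, 60.0, 200.0, 150.0]]
>>> sample_categories = [0]

>>> out = train_augment_and_transform(
...     image=sample_image, bboxes=sample_boxes, category=sample_categories
... )
>>> # The augmented boxes stay in COCO format and remain aligned with the augmented image
>>> out["image"].shape, out["bboxes"], out["category"]
```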
@@ -294,7 +284,7 @@ The `image_processor` expects the annotations to be in the following format: `{'
Now you can combine the image and annotation transformations to use on a batch of examples:
```py
->>> def augment_and_transform_batch(examples, transform, image_processor):
+>>> def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
... """Apply augmentations and format annotations in COCO format for object detection task"""
... images = []
@@ -315,6 +305,9 @@ Now you can combine the image and annotation transformations to use on a batch o
... # Apply the image processor transformations: resizing, rescaling, normalization
... result = image_processor(images=images, annotations=annotations, return_tensors="pt")
+... if not return_pixel_mask:
+...     result.pop("pixel_mask", None)
... return result
```
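Because the processor now resizes and pads every image to the same square size, the `pixel_mask` adds no information, which is why it is dropped unless `return_pixel_mask=True`; a batch can then be collated by simply stacking `pixel_values`. A minimal sketch of such a collate function, keyed on the fields the processor returns (the guide's own collate function may differ in details):

```py
>>> import torch

>>> def collate_fn(batch):
...     """Stack already-padded images; all tensors share the same height and width."""
...     data = {"pixel_values": torch.stack([x["pixel_values"] for x in batch])}
...     data["labels"] = [x["labels"] for x in batch]
...     if "pixel_mask" in batch[0]:
...         data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])
...     return data
```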
@@ -1485,25 +1478,12 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin
```py
>>> import torch
>>> import requests
->>> import numpy as np
->>> import albumentations as A
->>> from PIL import Image
+>>> from PIL import Image, ImageDraw
>>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
>>> url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
>>> image = Image.open(requests.get(url, stream=True).raw)
->>> # Define transformations for inference
->>> resize_and_pad = A.Compose([
-... A.LongestMaxSize(max_size=max_size),
-... A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-... ])
->>> # This one is for visualization with no padding
->>> resize_only = A.Compose([
-... A.LongestMaxSize(max_size=max_size),
-... ])
```
Load the model and image processor from the Hugging Face Hub (skip this step if you already trained a model in this session):
@@ -1519,12 +1499,11 @@ Load model and image processor from the Hugging Face Hub (skip to use already tr
And detect bounding boxes:
```py
->>> np_preprocessed_image = resize_and_pad(image=np.array(image))["image"]
>>> with torch.no_grad():
-... inputs = image_processor(images=[np_preprocessed_image], return_tensors="pt")
-... outputs = model(inputs["pixel_values"].to(device))
-... target_sizes = torch.tensor([np_preprocessed_image.shape[:2]])
+... inputs = image_processor(images=[image], return_tensors="pt")
+... outputs = model(**inputs.to(device))
+... target_sizes = torch.tensor([[image.size[1], image.size[0]]])
... results = image_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
@@ -1543,9 +1522,7 @@ Detected Coverall with confidence 0.391 at location [68.61, 126.66, 309.03, 318.
Let's plot the result:
```py
->>> resized_image = resize_only(image=np.array(image))["image"]
->>> resized_image = Image.fromarray(resized_image)
->>> draw = ImageDraw.Draw(resized_image)
+>>> draw = ImageDraw.Draw(image)
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
... box = [round(i, 2) for i in box.tolist()]
@@ -1553,7 +1530,7 @@ Let's plot the result:
... draw.rectangle((x, y, x2, y2), outline="red", width=1)
... draw.text((x, y), model.config.id2label[label.item()], fill="white")
->>> resized_image
+>>> image
```
<div class="flex justify-center">
@@ -50,7 +50,7 @@ python run_object_detection.py \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 1 \
--remove_unused_columns false \
--eval_do_concat_batches false \
--eval_do_concat_batches false \
--ignore_mismatched_sizes true \
--metric_for_best_model eval_map \
--greater_is_better true \
@@ -200,6 +200,7 @@ Where `metadata.jsonl` is a file with the following structure:
{"file_name": "0002.jpg", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "categories": [1], "id": [2], "area": [40.0]}}
...
```
+The training script supports bounding boxes in COCO format (x_min, y_min, width, height).
Then, you can load the dataset with just a few lines of code:
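As a quick illustration of both points, here is a hedged sketch that loads such a dataset with 🤗 Datasets and converts the example box above from COCO format to corner coordinates; the `data/` path and the `coco_to_corners` helper are placeholders, not part of the training script:

```py
from datasets import load_dataset

# "imagefolder" picks up metadata.jsonl automatically and exposes the
# "objects" column (bbox, categories, id, area) alongside each image.
dataset = load_dataset("imagefolder", data_dir="data/")
print(dataset["train"][0]["objects"])

# COCO boxes are (x_min, y_min, width, height); corner coordinates are a simple sum.
def coco_to_corners(box):
    x_min, y_min, width, height = box
    return [x_min, y_min, x_min + width, y_min + height]

print(coco_to_corners([810.0, 100.0, 57.0, 28.0]))  # [810.0, 100.0, 867.0, 128.0]
```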
@@ -117,7 +117,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
def augment_and_transform_batch(
-examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
+examples: Mapping[str, Any],
+transform: A.Compose,
+image_processor: AutoImageProcessor,
+return_pixel_mask: bool = False,
) -> BatchFeature:
"""Apply augmentations and format annotations in COCO format for object detection task"""
@@ -139,6 +142,9 @@ def augment_and_transform_batch(
# Apply the image processor transformations: resizing, rescaling, normalization
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
+if not return_pixel_mask:
+    result.pop("pixel_mask", None)
return result
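In both the guide and the example scripts, this batch transform is bound to the loaded image processor and attached to the dataset; a hedged sketch of that wiring, where `dataset`, `train_augment_and_transform`, `validation_transform`, and `image_processor` stand for the objects defined earlier and the exact variable names may differ:

```py
from functools import partial

# Bind the augmentation pipeline and image processor; pixel_mask is dropped by default.
train_transform_batch = partial(
    augment_and_transform_batch,
    transform=train_augment_and_transform,
    image_processor=image_processor,
)
validation_transform_batch = partial(
    augment_and_transform_batch,
    transform=validation_transform,
    image_processor=image_processor,
)

# with_transform applies the function lazily, batch by batch, when examples are accessed.
dataset["train"] = dataset["train"].with_transform(train_transform_batch)
dataset["validation"] = dataset["validation"].with_transform(validation_transform_batch)
```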
@@ -415,12 +421,10 @@ def main():
)
image_processor = AutoImageProcessor.from_pretrained(
model_args.image_processor_name or model_args.model_name_or_path,
-# At this moment we recommend using external transform to pad and resize images.
-# It`s faster and yields much better results for object-detection models.
-do_pad=False,
-do_resize=False,
-# We will save image size parameter in config just for reference
-size={"longest_edge": data_args.image_square_size},
+do_resize=True,
+size={"max_height": data_args.image_square_size, "max_width": data_args.image_square_size},
+do_pad=True,
+pad_size={"height": data_args.image_square_size, "width": data_args.image_square_size},
**common_pretrained_args,
)
@@ -428,10 +432,6 @@ def main():
# Define image augmentations and dataset transforms
# ------------------------------------------------------------------------------------------------
max_size = data_args.image_square_size
-basic_transforms = [
-A.LongestMaxSize(max_size=max_size),
-A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-]
train_augment_and_transform = A.Compose(
[
A.Compose(
@@ -453,12 +453,11 @@
A.HorizontalFlip(p=0.5),
A.RandomBrightnessContrast(p=0.5),
A.HueSaturationValue(p=0.1),
-*basic_transforms,
],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)
validation_transform = A.Compose(
-basic_transforms,
+[A.NoOp()],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
)
@@ -120,7 +120,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
# Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch
def augment_and_transform_batch(
-examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
+examples: Mapping[str, Any],
+transform: A.Compose,
+image_processor: AutoImageProcessor,
+return_pixel_mask: bool = False,
) -> BatchFeature:
"""Apply augmentations and format annotations in COCO format for object detection task"""
@@ -142,6 +145,9 @@ def augment_and_transform_batch(
# Apply the image processor transformations: resizing, rescaling, normalization
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
+if not return_pixel_mask:
+    result.pop("pixel_mask", None)
return result
@@ -473,12 +479,10 @@ def main():
)
image_processor = AutoImageProcessor.from_pretrained(
args.model_name_or_path,
-# At this moment we recommend using external transform to pad and resize images.
-# It`s faster and yields much better results for object-detection models.
-do_pad=False,
-do_resize=False,
-# We will save image size parameter in config just for reference
-size={"longest_edge": args.image_square_size},
+do_resize=True,
+size={"max_height": args.image_square_size, "max_width": args.image_square_size},
+do_pad=True,
+pad_size={"height": args.image_square_size, "width": args.image_square_size},
**common_pretrained_args,
)
@@ -486,10 +490,6 @@ def main():
# Define image augmentations and dataset transforms
# ------------------------------------------------------------------------------------------------
max_size = args.image_square_size
-basic_transforms = [
-A.LongestMaxSize(max_size=max_size),
-A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-]
train_augment_and_transform = A.Compose(
[
A.Compose(
@@ -511,12 +511,11 @@
A.HorizontalFlip(p=0.5),
A.RandomBrightnessContrast(p=0.5),
A.HueSaturationValue(p=0.1),
-*basic_transforms,
],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)
validation_transform = A.Compose(
-basic_transforms,
+[A.NoOp()],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
)