Unverified commit 5b949623 authored by Jitesh Jain, committed by GitHub

Add OneFormer Model (#20577)

* Add Oneformer Model

* Add OneFormer Tests

* Add UNIVERSAL_SEGMENTATION_MAPPING

* Fix config

* 🐛 Fix error encountered while writing tests

* 🔨 Fix instance segmentation post processing

* Format Files and Add Documentation

* Add Documentation mdx file

* Run make fixup

* Run make fix-copies

* Remove unnecessary code

* Format modeling_oneformer.py

* Add OneFormer to ImageSegmentationPipeline

* Format files

* Add Demo link to Readme

* Fix formatting errors

* Fix test failures

* Update Table in index.mdx

* Fix version

* Fix style

* Remove OneFormer from TF

* Fix Imports

* Fix dummy objects

* Fix tests

* Add newline

* Remove OneFormerFeatureExtractor

* Remove CUDA Kernels

* Use AutoBackbone for Swin

* Fix description

* Use Image Processor

* Fix copies

* Fix formatting

* Fix import order

* Fix flake8 errors

* Fix doc errors

* Add Hindi Readme entry

* Update supported backbones

* Update supported backbones

* Undo Changes

* Fix type of config

* Fix isort

* Fix auto.mdx

* Fix swin config

* Replace DinatBackbone with AutoBackbone

* Use SwinBackbone

* Use SwinBackbone

* Fix conversion script

* Fix arguments

* Add argument description

* Fix style

* Add OneFormerProcessor

* Fix OneFormerProcessor Tests

* Fix mapping

* Fix imports

* Fix inits

* Fix style

* Fix comment

* Fix docstring

* Move OneFormer to MultiModal

* Fix Copies

* Remove size divisor

* Fix check_repo.py

* Fix copies

* Add Processor for Testing Pipeline

* Fix padding for tokens

* Fix variables

* Fix formatting with correct black version

* Add Image Processor Test

* Apply suggestions

* Revert common modeling

* Add check for task

* Fix conversion script

* Fix initialization order

* Fix tests

* Undo Pipeline Changes

* Fix layers in MLP

* Fix copies

* Update image paths

* Fix copies

* Apply suggestions
parent 6d676643
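A minimal inference sketch of what this PR adds, using the `shi-labs/oneformer_ade20k_swin_tiny` checkpoint referenced in the config archive map below; the sample image URL and the printed shapes are illustrative, not part of this diff:

```python
import requests
from PIL import Image

from transformers import OneFormerForUniversalSegmentation, OneFormerProcessor

# the processor wraps OneFormerImageProcessor + CLIPTokenizer; the model is the new segmentation head
processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")
model = OneFormerForUniversalSegmentation.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any RGB image works
image = Image.open(requests.get(url, stream=True).raw)

# the processor turns the task string into "the task is semantic" tokens and preprocesses the image
inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")
outputs = model(**inputs)

# post-processing is forwarded to OneFormerImageProcessor
semantic_map = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(semantic_map.shape)  # (height, width) map of predicted class ids
```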
......@@ -53,6 +53,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("layoutlmv3", "LayoutLMv3Processor"),
("layoutxlm", "LayoutXLMProcessor"),
("markuplm", "MarkupLMProcessor"),
("oneformer", "OneFormerProcessor"),
("owlvit", "OwlViTProcessor"),
("sew", "Wav2Vec2Processor"),
("sew-d", "Wav2Vec2Processor"),
......
......@@ -208,6 +208,7 @@ else:
"AlbertTokenizerFast" if is_tokenizers_available() else None,
),
),
("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
("opt", ("GPT2Tokenizer", None)),
("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
......
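Given the two auto-mapping entries above, the Auto classes should resolve OneFormer checkpoints to the new processor and to the CLIP tokenizer. A hedged sketch, assuming the checkpoint hosts the usual processor and tokenizer files:

```python
from transformers import AutoProcessor, AutoTokenizer, OneFormerProcessor

# PROCESSOR_MAPPING: "oneformer" -> OneFormerProcessor
processor = AutoProcessor.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")
assert isinstance(processor, OneFormerProcessor)

# TOKENIZER_MAPPING: "oneformer" -> CLIPTokenizer / CLIPTokenizerFast
tokenizer = AutoTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")
print(type(tokenizer).__name__)  # CLIPTokenizerFast when the tokenizers library is installed
```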
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_oneformer": ["ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "OneFormerConfig"],
"processing_oneformer": ["OneFormerProcessor"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_oneformer"] = ["OneFormerImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_oneformer"] = [
"ONEFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"OneFormerForUniversalSegmentation",
"OneFormerModel",
"OneFormerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_oneformer import ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, OneFormerConfig
from .processing_oneformer import OneFormerProcessor
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_oneformer import OneFormerImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_oneformer import (
ONEFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
OneFormerForUniversalSegmentation,
OneFormerModel,
OneFormerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
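The lazy-module layout above only resolves a name when it is first accessed, so the config and processor stay importable without torch or vision installed. A small sketch of the public surface it exposes, assuming both backends are present for the heavier classes:

```python
from transformers.models.oneformer import (
    OneFormerConfig,                    # always importable
    OneFormerProcessor,                 # always importable
    OneFormerImageProcessor,            # needs the vision backend (Pillow)
    OneFormerForUniversalSegmentation,  # needs torch
)

config = OneFormerConfig()  # logs that the default Swin backbone config is used
print(config.model_type)    # "oneformer"
```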
# coding=utf-8
# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""OneFormer model configuration"""
import copy
from typing import Dict, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/oneformer_ade20k_swin_tiny": (
"https://huggingface.co/shi-labs/oneformer_ade20k_swin_tiny/blob/main/config.json"
),
# See all OneFormer models at https://huggingface.co/models?filter=oneformer
}
class OneFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OneFormerModel`]. It is used to instantiate a
OneFormer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OneFormer
[shi-labs/oneformer_ade20k_swin_tiny](https://huggingface.co/shi-labs/oneformer_ade20k_swin_tiny) architecture
trained on [ADE20k-150](https://huggingface.co/datasets/scene_parse_150).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone_config (`PretrainedConfig`, *optional*, defaults to `SwinConfig`):
The configuration of the backbone model.
ignore_value (`int`, *optional*, defaults to 255):
Value to be ignored in the GT label while calculating the loss.
num_queries (`int`, *optional*, defaults to 150):
Number of object queries.
no_object_weight (`float`, *optional*, defaults to 0.1):
Weight for the no-object class predictions.
class_weight (`float`, *optional*, defaults to 2.0):
Weight for the classification CE loss.
mask_weight (`float`, *optional*, defaults to 5.0):
Weight for the binary CE loss.
dice_weight (`float`, *optional*, defaults to 5.0):
Weight for the dice loss.
contrastive_weight (`float`, *optional*, defaults to 0.5):
Weight for the contrastive loss.
contrastive_temperature (`float`, *optional*, defaults to 0.07):
Initial value for scaling the contrastive logits.
train_num_points (`int`, *optional*, defaults to 12544):
Number of points to sample while calculating losses on the mask predictions.
oversample_ratio (`float`, *optional*, defaults to 3.0):
Ratio deciding how many points to oversample.
importance_sample_ratio (`float`, *optional*, defaults to 0.75):
Ratio of points that are sampled via importance sampling.
init_std (`float`, *optional*, defaults to 0.02):
Standard deviation for normal initialization.
init_xavier_std (`float`, *optional*, defaults to 1.0):
Standard deviation for Xavier uniform initialization.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
Epsilon for layer normalization.
is_training (`bool`, *optional*, defaults to `False`):
Whether to run in training or inference mode.
use_auxiliary_loss (`bool`, *optional*, defaults to `True`):
Whether to calculate the loss using intermediate predictions from the transformer decoder.
output_auxiliary_logits (`bool`, *optional*, defaults to `True`):
Whether to return intermediate predictions from the transformer decoder.
strides (`list`, *optional*, defaults to `[4, 8, 16, 32]`):
List containing the strides for the feature maps in the encoder.
task_seq_len (`int`, *optional*, defaults to 77):
Sequence length for tokenizing the task input.
max_seq_len (`int`, *optional*, defaults to 77):
Sequence length for tokenizing the text list input.
text_encoder_width (`int`, *optional*, defaults to 256):
Hidden size of the text encoder.
text_encoder_context_length (`int`, *optional*, defaults to 77):
Input sequence length of the text encoder.
text_encoder_num_layers (`int`, *optional*, defaults to 6):
Number of transformer layers in the text encoder.
text_encoder_vocab_size (`int`, *optional*, defaults to 49408):
Vocabulary size of the tokenizer.
text_encoder_proj_layers (`int`, *optional*, defaults to 2):
Number of layers in the MLP used to project the text queries.
text_encoder_n_ctx (`int`, *optional*, defaults to 16):
Number of learnable text context queries.
conv_dim (`int`, *optional*, defaults to 256):
Feature map dimension to which the backbone outputs are mapped.
mask_dim (`int`, *optional*, defaults to 256):
Dimension of the feature maps in the pixel decoder.
hidden_dim (`int`, *optional*, defaults to 256):
Dimension of the hidden states in the transformer decoder.
encoder_feedforward_dim (`int`, *optional*, defaults to 1024):
Dimension of the FFN layer in the pixel decoder.
norm (`str`, *optional*, defaults to `"GN"`):
Type of normalization.
encoder_layers (`int`, *optional*, defaults to 6):
Number of layers in the pixel decoder.
decoder_layers (`int`, *optional*, defaults to 10):
Number of layers in the transformer decoder.
use_task_norm (`bool`, *optional*, defaults to `True`):
Whether to normalize the task token.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads in the transformer layers of the pixel and transformer decoders.
dropout (`float`, *optional*, defaults to 0.1):
Dropout probability for the pixel and transformer decoders.
dim_feedforward (`int`, *optional*, defaults to 2048):
Dimension of the FFN layer in the transformer decoder.
pre_norm (`bool`, *optional*, defaults to `False`):
Whether to normalize the hidden states before the attention layers in the transformer decoder.
enforce_input_proj (`bool`, *optional*, defaults to `False`):
Whether to project the hidden states in the transformer decoder.
query_dec_layers (`int`, *optional*, defaults to 2):
Number of layers in the query transformer.
common_stride (`int`, *optional*, defaults to 4):
Common stride used for the features in the pixel decoder.
Examples:
```python
>>> from transformers import OneFormerConfig, OneFormerModel
>>> # Initializing a OneFormer shi-labs/oneformer_ade20k_swin_tiny configuration
>>> configuration = OneFormerConfig()
>>> # Initializing a model (with random weights) from the shi-labs/oneformer_ade20k_swin_tiny style configuration
>>> model = OneFormerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "oneformer"
attribute_map = {"hidden_size": "hidden_dim"}
def __init__(
self,
backbone_config: Optional[Dict] = None,
ignore_value: int = 255,
num_queries: int = 150,
no_object_weight: float = 0.1,
class_weight: float = 2.0,
mask_weight: float = 5.0,
dice_weight: float = 5.0,
contrastive_weight: float = 0.5,
contrastive_temperature: float = 0.07,
train_num_points: int = 12544,
oversample_ratio: float = 3.0,
importance_sample_ratio: float = 0.75,
init_std: float = 0.02,
init_xavier_std: float = 1.0,
layer_norm_eps: float = 1e-05,
is_training: bool = False,
use_auxiliary_loss: bool = True,
output_auxiliary_logits: bool = True,
strides: Optional[list] = [4, 8, 16, 32],
task_seq_len: int = 77,
max_seq_len: int = 77,
text_encoder_width: int = 256,
text_encoder_context_length: int = 77,
text_encoder_num_layers: int = 6,
text_encoder_vocab_size: int = 49408,
text_encoder_proj_layers: int = 2,
text_encoder_n_ctx: int = 16,
conv_dim: int = 256,
mask_dim: int = 256,
hidden_dim: int = 256,
encoder_feedforward_dim: int = 1024,
norm: str = "GN",
encoder_layers: int = 6,
decoder_layers: int = 10,
use_task_norm: bool = True,
num_attention_heads: int = 8,
dropout: float = 0.1,
dim_feedforward: int = 2048,
pre_norm: bool = False,
enforce_input_proj: bool = False,
query_dec_layers: int = 2,
common_stride: int = 4,
**kwargs,
):
if backbone_config is None:
logger.info("`backbone_config` is unset. Initializing the config with the default `Swin` backbone.")
backbone_config = CONFIG_MAPPING["swin"](
image_size=224,
in_channels=3,
patch_size=4,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
drop_path_rate=0.3,
use_absolute_embeddings=False,
out_features=["stage1", "stage2", "stage3", "stage4"],
)
elif isinstance(backbone_config, dict):
backbone_model_type = backbone_config.get("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
self.backbone_config = backbone_config
self.ignore_value = ignore_value
self.num_queries = num_queries
self.no_object_weight = no_object_weight
self.class_weight = class_weight
self.mask_weight = mask_weight
self.dice_weight = dice_weight
self.contrastive_weight = contrastive_weight
self.contrastive_temperature = contrastive_temperature
self.train_num_points = train_num_points
self.oversample_ratio = oversample_ratio
self.importance_sample_ratio = importance_sample_ratio
self.init_std = init_std
self.init_xavier_std = init_xavier_std
self.layer_norm_eps = layer_norm_eps
self.is_training = is_training
self.use_auxiliary_loss = use_auxiliary_loss
self.output_auxiliary_logits = output_auxiliary_logits
self.strides = strides
self.task_seq_len = task_seq_len
self.max_seq_len = max_seq_len
self.text_encoder_width = text_encoder_width
self.text_encoder_context_length = text_encoder_context_length
self.text_encoder_num_layers = text_encoder_num_layers
self.text_encoder_vocab_size = text_encoder_vocab_size
self.text_encoder_proj_layers = text_encoder_proj_layers
self.text_encoder_n_ctx = text_encoder_n_ctx
self.conv_dim = conv_dim
self.mask_dim = mask_dim
self.hidden_dim = hidden_dim
self.encoder_feedforward_dim = encoder_feedforward_dim
self.norm = norm
self.encoder_layers = encoder_layers
self.decoder_layers = decoder_layers
self.use_task_norm = use_task_norm
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.dim_feedforward = dim_feedforward
self.pre_norm = pre_norm
self.enforce_input_proj = enforce_input_proj
self.query_dec_layers = query_dec_layers
self.common_stride = common_stride
self.num_hidden_layers = decoder_layers
super().__init__(**kwargs)
def to_dict(self) -> Dict[str, any]:
"""
Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["backbone_config"] = self.backbone_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
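As a complement to the docstring example, `backbone_config` accepts either a config object or a plain dict with a `model_type` key, and `to_dict` serializes the nested backbone back into a dict. A sketch with an illustrative (not pretrained) Swin backbone:

```python
from transformers import OneFormerConfig, SwinConfig

# illustrative Swin backbone; a dict such as {"model_type": "swin", ...} would be converted the same way
backbone = SwinConfig(
    embed_dim=96,
    depths=[2, 2, 6, 2],
    num_heads=[3, 6, 12, 24],
    out_features=["stage1", "stage2", "stage3", "stage4"],
)
config = OneFormerConfig(backbone_config=backbone, num_queries=150)

as_dict = config.to_dict()
print(as_dict["model_type"])                     # "oneformer"
print(as_dict["backbone_config"]["model_type"])  # "swin"
```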
# coding=utf-8
# Copyright 2022 SHI Labs and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image/Text processor class for OneFormer
"""
from typing import List
from transformers.utils import is_torch_available
from ...processing_utils import ProcessorMixin
if is_torch_available():
import torch
class OneFormerProcessor(ProcessorMixin):
r"""
Constructs a OneFormer processor which wraps [`OneFormerImageProcessor`] and
[`CLIPTokenizer`]/[`CLIPTokenizerFast`] into a single processor that inherits both the image processor and
tokenizer functionalities.
Args:
image_processor ([`OneFormerImageProcessor`]):
The image processor is a required input.
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
The tokenizer is a required input.
max_seq_length (`int`, *optional*, defaults to 77):
Sequence length for the input text list.
task_seq_length (`int`, *optional*, defaults to 77):
Sequence length for the input task token.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "OneFormerImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(
self, image_processor=None, tokenizer=None, max_seq_length: int = 77, task_seq_length: int = 77, **kwargs
):
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.max_seq_length = max_seq_length
self.task_seq_length = task_seq_length
super().__init__(image_processor, tokenizer)
def _preprocess_text(self, text_list=None, max_length=77):
if text_list is None:
raise ValueError("tokens cannot be None.")
tokens = self.tokenizer(text_list, padding="max_length", max_length=max_length, truncation=True)
attention_masks, input_ids = tokens["attention_mask"], tokens["input_ids"]
token_inputs = []
for attn_mask, input_id in zip(attention_masks, input_ids):
# zero out the ids at padded positions by multiplying with the attention mask
token = torch.tensor(attn_mask) * torch.tensor(input_id)
token_inputs.append(token.unsqueeze(0))
token_inputs = torch.cat(token_inputs, dim=0)
return token_inputs
def __call__(self, images=None, task_inputs=None, segmentation_maps=None, **kwargs):
"""
Main method to prepare one or several task input(s) and image(s) for the model. This method forwards the
`task_inputs` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] to encode the text if
`task_inputs` is not `None`. To prepare the image(s), this method forwards the `images` and `kwargs` arguments
to OneFormerImageProcessor's [`~OneFormerImageProcessor.__call__`] if `images` is not `None`. Please refer to
the docstring of the above two methods for more information.
Args:
task_inputs (`str`, `List[str]`):
The sequence or batch of task_inputs sequences to be encoded. Each sequence can be a string or a list
of strings of the template "the task is {task}".
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
`List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
segmentation_maps (`ImageInput`, *optional*):
The corresponding semantic segmentation maps with the pixel-wise annotations.
pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether or not to pad images up to the largest image in a batch and create a pixel mask.
If left to the default, will return a pixel mask that is:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **task_inputs** -- List of token ids to be fed to a model. Returned when `task_inputs` is not `None`.
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if task_inputs is None:
raise ValueError("You have to specify the task_input. Found None.")
elif images is None:
raise ValueError("You have to specify the image. Found None.")
if not all(task in ["semantic", "instance", "panoptic"] for task in task_inputs):
raise ValueError("task_inputs must be semantic, instance, or panoptic.")
encoded_inputs = self.image_processor(images, task_inputs, segmentation_maps, **kwargs)
if isinstance(task_inputs, str):
task_inputs = [task_inputs]
if isinstance(task_inputs, List) and all(isinstance(task_input, str) for task_input in task_inputs):
task_token_inputs = []
for task in task_inputs:
task_input = f"the task is {task}"
task_token_inputs.append(task_input)
encoded_inputs["task_inputs"] = self._preprocess_text(task_token_inputs, max_length=self.task_seq_length)
else:
raise TypeError("Task Inputs should be a string or a list of strings.")
if hasattr(encoded_inputs, "text_inputs"):
texts_list = encoded_inputs.text_inputs
text_inputs = []
for texts in texts_list:
text_input_list = self._preprocess_text(texts, max_length=self.max_seq_length)
text_inputs.append(text_input_list.unsqueeze(0))
encoded_inputs["text_inputs"] = torch.cat(text_inputs, dim=0)
return encoded_inputs
def encode_inputs(self, images=None, task_inputs=None, segmentation_maps=None, **kwargs):
"""
This method forwards all its arguments to [`OneFormerImageProcessor.encode_inputs`] and then tokenizes the
task_inputs. Please refer to the docstring of this method for more information.
"""
if task_inputs is None:
raise ValueError("You have to specify the task_input. Found None.")
elif images is None:
raise ValueError("You have to specify the image. Found None.")
if not all(task in ["semantic", "instance", "panoptic"] for task in task_inputs):
raise ValueError("task_inputs must be semantic, instance, or panoptic.")
encoded_inputs = self.image_processor.encode_inputs(images, task_inputs, segmentation_maps, **kwargs)
if isinstance(task_inputs, str):
task_inputs = [task_inputs]
if isinstance(task_inputs, List) and all(isinstance(task_input, str) for task_input in task_inputs):
task_token_inputs = []
for task in task_inputs:
task_input = f"the task is {task}"
task_token_inputs.append(task_input)
encoded_inputs["task_inputs"] = self._preprocess_text(task_token_inputs, max_length=self.task_seq_length)
else:
raise TypeError("Task Inputs should be a string or a list of strings.")
if hasattr(encoded_inputs, "text_inputs"):
texts_list = encoded_inputs.text_inputs
text_inputs = []
for texts in texts_list:
text_input_list = self._preprocess_text(texts, max_length=self.max_seq_length)
text_inputs.append(text_input_list.unsqueeze(0))
encoded_inputs["text_inputs"] = torch.cat(text_inputs, dim=0)
return encoded_inputs
def post_process_semantic_segmentation(self, *args, **kwargs):
"""
This method forwards all its arguments to [`OneFormerImageProcessor.post_process_semantic_segmentation`].
Please refer to the docstring of this method for more information.
"""
return self.image_processor.post_process_semantic_segmentation(*args, **kwargs)
def post_process_instance_segmentation(self, *args, **kwargs):
"""
This method forwards all its arguments to [`OneFormerImageProcessor.post_process_instance_segmentation`].
Please refer to the docstring of this method for more information.
"""
return self.image_processor.post_process_instance_segmentation(*args, **kwargs)
def post_process_panoptic_segmentation(self, *args, **kwargs):
"""
This method forwards all its arguments to [`OneFormerImageProcessor.post_process_panoptic_segmentation`].
Please refer to the docstring of this method for more information.
"""
return self.image_processor.post_process_panoptic_segmentation(*args, **kwargs)
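A short training-style sketch of the processor defined above: when `segmentation_maps` are supplied, the wrapped image processor also returns per-image `text_inputs`, which `__call__` tokenizes to `max_seq_length`. The dummy arrays below are purely illustrative:

```python
import numpy as np
from PIL import Image

from transformers import OneFormerProcessor

processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")

image = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))           # dummy RGB image
segmentation_map = np.random.randint(0, 10, (512, 512)).astype(np.uint8)   # dummy semantic map

inputs = processor(
    images=image, task_inputs=["semantic"], segmentation_maps=segmentation_map, return_tensors="pt"
)
print(inputs["task_inputs"].shape)  # (1, task_seq_length), i.e. (1, 77)
print(inputs["text_inputs"].shape)  # (1, num_text, max_seq_length)
print(len(inputs["mask_labels"]), len(inputs["class_labels"]))  # one entry per image
```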
......@@ -4242,6 +4242,30 @@ class NystromformerPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
ONEFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
class OneFormerForUniversalSegmentation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class OneFormerModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class OneFormerPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -318,6 +318,13 @@ class MobileViTImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class OneFormerImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class OwlViTFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from transformers import OneFormerImageProcessor
from transformers.models.oneformer.image_processing_oneformer import binary_mask_to_rle
from transformers.models.oneformer.modeling_oneformer import OneFormerForUniversalSegmentationOutput
if is_vision_available():
from PIL import Image
def prepare_metadata(class_info_file, repo_path="shi-labs/oneformer_demo"):
with open(hf_hub_download(repo_path, class_info_file, repo_type="dataset"), "r") as f:
class_info = json.load(f)
metadata = {}
class_names = []
thing_ids = []
for key, info in class_info.items():
metadata[key] = info["name"]
class_names.append(info["name"])
if info["isthing"]:
thing_ids.append(int(key))
metadata["thing_ids"] = thing_ids
metadata["class_names"] = class_names
return metadata
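For reference, the helper above produces a flat dict: string class-id keys map to class names, plus the aggregated `class_names` and `thing_ids` lists. A quick sketch against the ADE20K panoptic file used throughout these tests:

```python
metadata = prepare_metadata("ade20k_panoptic.json")
print(metadata["class_names"][:3])  # first few class names from the JSON file
print(metadata["thing_ids"][:5])    # ids whose "isthing" flag is set
print(metadata["0"])                # per-id lookup also works; keys are string ids
```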
class OneFormerImageProcessorTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
min_resolution=30,
max_resolution=400,
size=None,
do_resize=True,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
num_labels=10,
reduce_labels=False,
ignore_index=255,
repo_path="shi-labs/oneformer_demo",
class_info_file="ade20k_panoptic.json",
num_text=10,
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = {"shortest_edge": 32, "longest_edge": 1333} if size is None else size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.class_info_file = class_info_file
self.metadata = prepare_metadata(class_info_file, repo_path)
self.num_text = num_text
self.repo_path = repo_path
# for the post_process_functions
self.batch_size = 2
self.num_queries = 10
self.num_classes = 10
self.height = 3
self.width = 4
self.num_labels = num_labels
self.reduce_labels = reduce_labels
self.ignore_index = ignore_index
def prepare_feat_extract_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"num_labels": self.num_labels,
"reduce_labels": self.reduce_labels,
"ignore_index": self.ignore_index,
"class_info_file": self.class_info_file,
"metadata": self.metadata,
"num_text": self.num_text,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to OneFormerImageProcessor,
assuming do_resize is set to True. The expected size is derived from the `shortest_edge` key of `size`.
"""
if not batched:
image = image_inputs[0]
if isinstance(image, Image.Image):
w, h = image.size
else:
h, w = image.shape[1], image.shape[2]
if w < h:
expected_height = int(self.size["shortest_edge"] * h / w)
expected_width = self.size["shortest_edge"]
elif w > h:
expected_height = self.size["shortest_edge"]
expected_width = int(self.size["shortest_edge"] * w / h)
else:
expected_height = self.size["shortest_edge"]
expected_width = self.size["shortest_edge"]
else:
expected_values = []
for image in image_inputs:
expected_height, expected_width = self.get_expected_values([image])
expected_values.append((expected_height, expected_width))
expected_height = max(expected_values, key=lambda item: item[0])[0]
expected_width = max(expected_values, key=lambda item: item[1])[1]
return expected_height, expected_width
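A worked instance of the resize arithmetic above, using the tester's default `{"shortest_edge": 32, "longest_edge": 1333}` size (the numbers are picked only for illustration):

```python
# a 300 x 400 (h x w) image has w > h, so the height is pinned to the shortest edge
shortest_edge = 32
h, w = 300, 400
expected_height = shortest_edge              # 32
expected_width = int(shortest_edge * w / h)  # int(42.67) == 42, aspect ratio preserved
```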
def get_fake_oneformer_outputs(self):
return OneFormerForUniversalSegmentationOutput(
# +1 for null class
class_queries_logits=torch.randn((self.batch_size, self.num_queries, self.num_classes + 1)),
masks_queries_logits=torch.randn((self.batch_size, self.num_queries, self.height, self.width)),
)
@require_torch
@require_vision
class OneFormerImageProcessingTest(FeatureExtractionSavingTestMixin, unittest.TestCase):
image_processing_class = OneFormerImageProcessor if (is_vision_available() and is_torch_available()) else None
# only for test_feature_extraction_common.test_feat_extract_to_json_string
feature_extraction_class = image_processing_class
def setUp(self):
self.image_processing_tester = OneFormerImageProcessorTester(self)
@property
def feat_extract_dict(self):
return self.image_processing_tester.prepare_feat_extract_dict()
def test_feat_extract_properties(self):
image_processor = self.image_processing_class(**self.feat_extract_dict)
self.assertTrue(hasattr(image_processor, "image_mean"))
self.assertTrue(hasattr(image_processor, "image_std"))
self.assertTrue(hasattr(image_processor, "do_normalize"))
self.assertTrue(hasattr(image_processor, "do_resize"))
self.assertTrue(hasattr(image_processor, "size"))
self.assertTrue(hasattr(image_processor, "ignore_index"))
self.assertTrue(hasattr(image_processor, "class_info_file"))
self.assertTrue(hasattr(image_processor, "num_text"))
self.assertTrue(hasattr(image_processor, "repo_path"))
self.assertTrue(hasattr(image_processor, "metadata"))
self.assertTrue(hasattr(image_processor, "reduce_labels"))
def test_batch_feature(self):
pass
def test_call_pil(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.feat_extract_dict)
# create random PIL images
image_inputs = prepare_image_inputs(self.image_processing_tester, equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processor(image_inputs[0], ["semantic"], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processing_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processing_tester.num_channels, expected_height, expected_width),
)
# Test batched
expected_height, expected_width = self.image_processing_tester.get_expected_values(image_inputs, batched=True)
encoded_images = image_processor(
image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt"
).pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processing_tester.batch_size,
self.image_processing_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_numpy(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.feat_extract_dict)
# create random numpy tensors
image_inputs = prepare_image_inputs(self.image_processing_tester, equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processor(image_inputs[0], ["semantic"], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processing_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processing_tester.num_channels, expected_height, expected_width),
)
# Test batched
expected_height, expected_width = self.image_processing_tester.get_expected_values(image_inputs, batched=True)
encoded_images = image_processor(
image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt"
).pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processing_tester.batch_size,
self.image_processing_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_pytorch(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.feat_extract_dict)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processing_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processor(image_inputs[0], ["semantic"], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processing_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processing_tester.num_channels, expected_height, expected_width),
)
# Test batched
expected_height, expected_width = self.image_processing_tester.get_expected_values(image_inputs, batched=True)
encoded_images = image_processor(
image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt"
).pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processing_tester.batch_size,
self.image_processing_tester.num_channels,
expected_height,
expected_width,
),
)
def test_equivalence_pad_and_create_pixel_mask(self):
# Initialize image_processors
image_processor_1 = self.image_processing_class(**self.feat_extract_dict)
image_processor_2 = self.image_processing_class(
do_resize=False,
do_normalize=False,
do_rescale=False,
num_labels=self.image_processing_tester.num_classes,
class_info_file="ade20k_panoptic.json",
num_text=self.image_processing_tester.num_text,
repo_path="shi-labs/oneformer_demo",
)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processing_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test whether encode_inputs and calling the image processor directly return the same padded tensors
encoded_images_with_method = image_processor_1.encode_inputs(
image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt"
)
encoded_images = image_processor_2(image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt")
self.assertTrue(
torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
)
self.assertTrue(
torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)
)
def comm_get_image_processor_inputs(
self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np"
):
image_processor = self.image_processing_class(**self.feat_extract_dict)
# prepare image and target
num_labels = self.image_processing_tester.num_labels
annotations = None
instance_id_to_semantic_id = None
image_inputs = prepare_image_inputs(self.image_processing_tester, equal_resolution=False)
if with_segmentation_maps:
high = num_labels
if is_instance_map:
labels_expanded = list(range(num_labels)) * 2
instance_id_to_semantic_id = {
instance_id: label_id for instance_id, label_id in enumerate(labels_expanded)
}
annotations = [
np.random.randint(0, high * 2, (img.size[1], img.size[0])).astype(np.uint8) for img in image_inputs
]
if segmentation_type == "pil":
annotations = [Image.fromarray(annotation) for annotation in annotations]
inputs = image_processor(
image_inputs,
["semantic"] * len(image_inputs),
annotations,
return_tensors="pt",
instance_id_to_semantic_id=instance_id_to_semantic_id,
pad_and_return_pixel_mask=True,
)
return inputs
def test_init_without_params(self):
pass
def test_call_with_segmentation_maps(self):
def common(is_instance_map=False, segmentation_type=None):
inputs = self.comm_get_image_processor_inputs(
with_segmentation_maps=True, is_instance_map=is_instance_map, segmentation_type=segmentation_type
)
mask_labels = inputs["mask_labels"]
class_labels = inputs["class_labels"]
pixel_values = inputs["pixel_values"]
text_inputs = inputs["text_inputs"]
# check the batch_size
for mask_label, class_label, text_input in zip(mask_labels, class_labels, text_inputs):
self.assertEqual(mask_label.shape[0], class_label.shape[0])
# this ensures that padding has happened
self.assertEqual(mask_label.shape[1:], pixel_values.shape[2:])
self.assertEqual(len(text_input), self.image_processing_tester.num_text)
common()
common(is_instance_map=True)
common(is_instance_map=False, segmentation_type="pil")
common(is_instance_map=True, segmentation_type="pil")
def test_binary_mask_to_rle(self):
fake_binary_mask = np.zeros((20, 50))
fake_binary_mask[0, 20:] = 1
fake_binary_mask[1, :15] = 1
fake_binary_mask[5, :10] = 1
rle = binary_mask_to_rle(fake_binary_mask)
# flattened row-major, the mask is 20 zeros, then 45 ones (30 from row 0 + 15 from row 1),
# then zeros until the 10 ones of row 5 -> two runs of ones, i.e. two (start, length) pairs
self.assertEqual(len(rle), 4)
self.assertEqual(rle[0], 21)  # 1-based start index of the first run of ones
self.assertEqual(rle[1], 45)  # length of the first run
def test_post_process_semantic_segmentation(self):
image_processor = self.image_processing_class(
num_labels=self.image_processing_tester.num_classes,
max_seq_length=77,
task_seq_length=77,
class_info_file="ade20k_panoptic.json",
num_text=self.image_processing_tester.num_text,
repo_path="shi-labs/oneformer_demo",
)
outputs = self.image_processing_tester.get_fake_oneformer_outputs()
segmentation = image_processor.post_process_semantic_segmentation(outputs)
self.assertEqual(len(segmentation), self.image_processing_tester.batch_size)
self.assertEqual(
segmentation[0].shape,
(
self.image_processing_tester.height,
self.image_processing_tester.width,
),
)
target_sizes = [(1, 4) for i in range(self.image_processing_tester.batch_size)]
segmentation = image_processor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes)
self.assertEqual(segmentation[0].shape, target_sizes[0])
def test_post_process_instance_segmentation(self):
image_processor = self.image_processing_class(
num_labels=self.image_processing_tester.num_classes,
max_seq_length=77,
task_seq_length=77,
class_info_file="ade20k_panoptic.json",
num_text=self.image_processing_tester.num_text,
repo_path="shi-labs/oneformer_demo",
)
outputs = self.image_processing_tester.get_fake_oneformer_outputs()
segmentation = image_processor.post_process_instance_segmentation(outputs, threshold=0)
self.assertTrue(len(segmentation) == self.image_processing_tester.batch_size)
for el in segmentation:
self.assertTrue("segmentation" in el)
self.assertTrue("segments_info" in el)
self.assertEqual(type(el["segments_info"]), list)
self.assertEqual(
el["segmentation"].shape, (self.image_processing_tester.height, self.image_processing_tester.width)
)
def test_post_process_panoptic_segmentation(self):
image_processor = self.image_processing_class(
num_labels=self.image_processing_tester.num_classes,
max_seq_length=77,
task_seq_length=77,
class_info_file="ade20k_panoptic.json",
num_text=self.image_processing_tester.num_text,
repo_path="shi-labs/oneformer_demo",
)
outputs = self.image_processing_tester.get_fake_oneformer_outputs()
segmentation = image_processor.post_process_panoptic_segmentation(outputs, threshold=0)
self.assertTrue(len(segmentation) == self.image_processing_tester.batch_size)
for el in segmentation:
self.assertTrue("segmentation" in el)
self.assertTrue("segments_info" in el)
self.assertEqual(type(el["segments_info"]), list)
self.assertEqual(
el["segmentation"].shape, (self.image_processing_tester.height, self.image_processing_tester.width)
)
......@@ -124,6 +124,8 @@ src/transformers/models/mobilevit/modeling_tf_mobilevit.py
src/transformers/models/nat/configuration_nat.py
src/transformers/models/nat/modeling_nat.py
src/transformers/models/nezha/configuration_nezha.py
src/transformers/models/oneformer/configuration_oneformer.py
src/transformers/models/oneformer/modeling_oneformer.py
src/transformers/models/openai/configuration_openai.py
src/transformers/models/opt/configuration_opt.py
src/transformers/models/opt/modeling_opt.py
......