"vscode:/vscode.git/clone" did not exist on "d8b641c8393f1f7ce0ec0d0a1f7c1cefcd966971"
Unverified commit 56baa033, authored by StevenBucaille, committed by GitHub

Implementation of SuperPoint and AutoModelForKeypointDetection (#28966)



* Added SuperPoint docs

* Added tests

* Removed commented part

* Commit to create and fix add_superpoint branch with a new branch

* Fixed dummy_pt_objects

* Committed missing files

* Fixed README.md

* Apply suggestions from code review

Fixed small changes
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Moved ImagePointDescriptionOutput from modeling_outputs.py to modeling_superpoint.py

* Removed AutoModelForKeypointDetection and related stuff

* Fixed inconsistencies in image_processing_superpoint.py

* Moved infer_on_model logic simply in test_inference

* Fixed bugs, added labels to forward method with checks whether it is properly a None value, also added tests about this logic in test_modeling_superpoint.py

* Added tests to SuperPointImageProcessor to ensure that images are properly converted to grayscale

* Removed remaining mentions of MODEL_FOR_KEYPOINT_DETECTION_MAPPING

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Fixed from (w, h) to (h, w) as input for tests

* Removed unnecessary condition

* Moved last_hidden_state to be the first returned

* Moved last_hidden_state to be the first returned (bis)

* Moved last_hidden_state to be the first returned (ter)

* Switched image_width and image_height in tests to match recent changes

* Added config as first SuperPointConvBlock init argument

* Reordered README's after merge

* Added missing first config argument to SuperPointConvBlock instantiations

* Removed formatting error

* Added SuperPoint to README's de, pt-br, ru, te and vi

* Checked out README_fr.md

* Fixed README_fr.md

* Test fix README_fr.md

* Test fix README_fr.md

* Last make fix-copies !

* Updated checkpoint path

* Removed unused SuperPoint doc

* Added missing image

* Update src/transformers/models/superpoint/modeling_superpoint.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Removed unnecessary import

* Update src/transformers/models/superpoint/modeling_superpoint.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Added SuperPoint to _toctree.yml

---------
Co-authored-by: steven <steven.bucaillle@gmail.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Steven Bucaille <steven.bucaille@buawei.com>
Parent: 2f9a3edb
@@ -207,6 +207,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("squeezebert", "SqueezeBertModel"),
("stablelm", "StableLmModel"),
("starcoder2", "Starcoder2Model"),
("superpoint", "SuperPointModel"),
("swiftformer", "SwiftFormerModel"),
("swin", "SwinModel"),
("swin2sr", "Swin2SRModel"),
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
# rely on isort to merge the imports
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_superpoint": [
"SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SuperPointConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_superpoint"] = ["SuperPointImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_superpoint"] = [
"SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SuperPointModel",
"SuperPointPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_superpoint import (
SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP,
SuperPointConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_superpoint import SuperPointImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_superpoint import (
SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST,
SuperPointModel,
SuperPointPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
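For context, a minimal sketch of how the lazy module above behaves at runtime; illustrative only, not part of the diff:

```python
# Illustrative: _LazyModule defers the heavy imports declared in
# _import_structure until an attribute is first accessed.
from transformers.models import superpoint

# Nothing from configuration_superpoint has been imported yet; this
# attribute access triggers the actual import.
config_cls = superpoint.SuperPointConfig
print(config_cls.model_type)  # "superpoint"
```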
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"magic-leap-community/superpoint": "https://huggingface.co/magic-leap-community/superpoint/blob/main/config.json"
}
class SuperPointConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SuperPointModel`]. It is used to instantiate a
SuperPoint model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SuperPoint
[magic-leap-community/superpoint](https://huggingface.co/magic-leap-community/superpoint) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
encoder_hidden_sizes (`List`, *optional*, defaults to `[64, 64, 128, 128]`):
The number of channels in each convolutional layer in the encoder.
decoder_hidden_size (`int`, *optional*, defaults to 256): The hidden size of the decoder.
keypoint_decoder_dim (`int`, *optional*, defaults to 65): The output dimension of the keypoint decoder.
descriptor_decoder_dim (`int`, *optional*, defaults to 256): The output dimension of the descriptor decoder.
keypoint_threshold (`float`, *optional*, defaults to 0.005):
The threshold to use for extracting keypoints.
max_keypoints (`int`, *optional*, defaults to -1):
The maximum number of keypoints to extract. If `-1`, will extract all keypoints.
nms_radius (`int`, *optional*, defaults to 4):
The radius for non-maximum suppression.
border_removal_distance (`int`, *optional*, defaults to 4):
The distance from the border to remove keypoints.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example:
```python
>>> from transformers import SuperPointConfig, SuperPointModel
>>> # Initializing a SuperPoint superpoint style configuration
>>> configuration = SuperPointConfig()
>>> # Initializing a model from the superpoint style configuration
>>> model = SuperPointModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "superpoint"
def __init__(
self,
encoder_hidden_sizes: List[int] = [64, 64, 128, 128],
decoder_hidden_size: int = 256,
keypoint_decoder_dim: int = 65,
descriptor_decoder_dim: int = 256,
keypoint_threshold: float = 0.005,
max_keypoints: int = -1,
nms_radius: int = 4,
border_removal_distance: int = 4,
initializer_range=0.02,
**kwargs,
):
self.encoder_hidden_sizes = encoder_hidden_sizes
self.decoder_hidden_size = decoder_hidden_size
self.keypoint_decoder_dim = keypoint_decoder_dim
self.descriptor_decoder_dim = descriptor_decoder_dim
self.keypoint_threshold = keypoint_threshold
self.max_keypoints = max_keypoints
self.nms_radius = nms_radius
self.border_removal_distance = border_removal_distance
self.initializer_range = initializer_range
super().__init__(**kwargs)
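A short, illustrative sketch of overriding the defaults documented above (the values here are arbitrary):

```python
from transformers import SuperPointConfig

# Illustrative: cap the number of extracted keypoints and raise the
# detection threshold; every other field keeps the defaults listed above.
config = SuperPointConfig(max_keypoints=1024, keypoint_threshold=0.01)
assert config.nms_radius == 4  # unchanged default
```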
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import requests
import torch
from PIL import Image
from transformers import SuperPointConfig, SuperPointImageProcessor, SuperPointModel
def get_superpoint_config():
config = SuperPointConfig(
encoder_hidden_sizes=[64, 64, 128, 128],
decoder_hidden_size=256,
keypoint_decoder_dim=65,
descriptor_decoder_dim=256,
keypoint_threshold=0.005,
max_keypoints=-1,
nms_radius=4,
border_removal_distance=4,
initializer_range=0.02,
)
return config
def create_rename_keys(config, state_dict):
rename_keys = []
# Encoder weights
rename_keys.append(("conv1a.weight", "encoder.conv_blocks.0.conv_a.weight"))
rename_keys.append(("conv1b.weight", "encoder.conv_blocks.0.conv_b.weight"))
rename_keys.append(("conv2a.weight", "encoder.conv_blocks.1.conv_a.weight"))
rename_keys.append(("conv2b.weight", "encoder.conv_blocks.1.conv_b.weight"))
rename_keys.append(("conv3a.weight", "encoder.conv_blocks.2.conv_a.weight"))
rename_keys.append(("conv3b.weight", "encoder.conv_blocks.2.conv_b.weight"))
rename_keys.append(("conv4a.weight", "encoder.conv_blocks.3.conv_a.weight"))
rename_keys.append(("conv4b.weight", "encoder.conv_blocks.3.conv_b.weight"))
rename_keys.append(("conv1a.bias", "encoder.conv_blocks.0.conv_a.bias"))
rename_keys.append(("conv1b.bias", "encoder.conv_blocks.0.conv_b.bias"))
rename_keys.append(("conv2a.bias", "encoder.conv_blocks.1.conv_a.bias"))
rename_keys.append(("conv2b.bias", "encoder.conv_blocks.1.conv_b.bias"))
rename_keys.append(("conv3a.bias", "encoder.conv_blocks.2.conv_a.bias"))
rename_keys.append(("conv3b.bias", "encoder.conv_blocks.2.conv_b.bias"))
rename_keys.append(("conv4a.bias", "encoder.conv_blocks.3.conv_a.bias"))
rename_keys.append(("conv4b.bias", "encoder.conv_blocks.3.conv_b.bias"))
# Keypoint Decoder weights
rename_keys.append(("convPa.weight", "keypoint_decoder.conv_score_a.weight"))
rename_keys.append(("convPb.weight", "keypoint_decoder.conv_score_b.weight"))
rename_keys.append(("convPa.bias", "keypoint_decoder.conv_score_a.bias"))
rename_keys.append(("convPb.bias", "keypoint_decoder.conv_score_b.bias"))
# Descriptor Decoder weights
rename_keys.append(("convDa.weight", "descriptor_decoder.conv_descriptor_a.weight"))
rename_keys.append(("convDb.weight", "descriptor_decoder.conv_descriptor_b.weight"))
rename_keys.append(("convDa.bias", "descriptor_decoder.conv_descriptor_a.bias"))
rename_keys.append(("convDb.bias", "descriptor_decoder.conv_descriptor_b.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_imgs():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im1 = Image.open(requests.get(url, stream=True).raw)
url = "http://images.cocodataset.org/test-stuff2017/000000004016.jpg"
im2 = Image.open(requests.get(url, stream=True).raw)
return [im1, im2]
@torch.no_grad()
def convert_superpoint_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub, test_mode=False):
"""
Copy/paste/tweak model's weights to our SuperPoint structure.
"""
print("Downloading original model from checkpoint...")
config = get_superpoint_config()
# load original state_dict from URL
original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)
print("Converting model parameters...")
# rename keys
rename_keys = create_rename_keys(config, original_state_dict)
new_state_dict = original_state_dict.copy()
for src, dest in rename_keys:
rename_key(new_state_dict, src, dest)
# Load HuggingFace model
model = SuperPointModel(config)
model.load_state_dict(new_state_dict)
model.eval()
print("Successfully loaded weights in the model")
# Check model outputs
preprocessor = SuperPointImageProcessor()
inputs = preprocessor(images=prepare_imgs(), return_tensors="pt")
outputs = model(**inputs)
# If test_mode is True, we check that the model outputs match the original results
if test_mode:
expected_keypoints_shape = (2, 830, 2)
expected_scores_shape = (2, 830)
expected_descriptors_shape = (2, 830, 256)
expected_keypoints_values = torch.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]])
expected_scores_values = torch.tensor([0.0064, 0.0140, 0.0595, 0.0728, 0.5170, 0.0175, 0.1523, 0.2055, 0.0336])
expected_descriptors_value = torch.tensor(-0.1096)
assert outputs.keypoints.shape == expected_keypoints_shape
assert outputs.scores.shape == expected_scores_shape
assert outputs.descriptors.shape == expected_descriptors_shape
assert torch.allclose(outputs.keypoints[0, :3], expected_keypoints_values, atol=1e-3)
assert torch.allclose(outputs.scores[0, :9], expected_scores_values, atol=1e-3)
assert torch.allclose(outputs.descriptors[0, 0, 0], expected_descriptors_value, atol=1e-3)
print("Model outputs match the original results!")
if save_model:
print("Saving model to local...")
# Create folder to save model
if not os.path.isdir(pytorch_dump_folder_path):
os.mkdir(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
preprocessor.save_pretrained(pytorch_dump_folder_path)
model_name = "superpoint"
if push_to_hub:
print(f"Pushing {model_name} to the hub...")
model.push_to_hub(model_name)
preprocessor.push_to_hub(model_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--checkpoint_url",
default="https://github.com/magicleap/SuperPointPretrainedNetwork/raw/master/superpoint_v1.pth",
type=str,
help="URL of the original SuperPoint checkpoint you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="model",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--save_model", action="store_true", help="Save model to local")
parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub")
args = parser.parse_args()
convert_superpoint_checkpoint(
args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub
)
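The entry point can also be called directly from Python; a minimal sketch, assuming the module is importable and using the script's default checkpoint URL (the output folder name is arbitrary):

```python
# Illustrative: convert the original checkpoint and save it locally
# without pushing to the Hub.
convert_superpoint_checkpoint(
    checkpoint_url="https://github.com/magicleap/SuperPointPretrainedNetwork/raw/master/superpoint_v1.pth",
    pytorch_dump_folder_path="converted_superpoint",  # arbitrary folder name
    save_model=True,
    push_to_hub=False,
)
```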
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for SuperPoint."""
from typing import Dict, Optional, Union
import numpy as np
from ... import is_vision_available, requires_backends
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
ImageInput,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def is_grayscale(
image: ImageInput,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
if input_data_format == ChannelDimension.FIRST:
return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...])
elif input_data_format == ChannelDimension.LAST:
return np.all(image[..., 0] == image[..., 1]) and np.all(image[..., 1] == image[..., 2])
def convert_to_grayscale(
image: ImageInput,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> ImageInput:
"""
Converts an image to grayscale format using the NTSC formula. Only supports numpy arrays and PIL images. TODO:
support torch and tensorflow grayscale conversion.
This function is supposed to return a 1-channel image, but it returns a 3-channel image with the same value in each
channel, because of an issue discussed in
https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446
Args:
image (Image):
The image to convert.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image.
"""
requires_backends(convert_to_grayscale, ["vision"])
if isinstance(image, np.ndarray):
if input_data_format == ChannelDimension.FIRST:
gray_image = image[0, ...] * 0.2989 + image[1, ...] * 0.5870 + image[2, ...] * 0.1140
gray_image = np.stack([gray_image] * 3, axis=0)
elif input_data_format == ChannelDimension.LAST:
gray_image = image[..., 0] * 0.2989 + image[..., 1] * 0.5870 + image[..., 2] * 0.1140
gray_image = np.stack([gray_image] * 3, axis=-1)
return gray_image
if not isinstance(image, PIL.Image.Image):
return image
image = image.convert("L")
return image
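A minimal numeric sketch of the NTSC weighting applied above, on a channels-first array (the values are arbitrary):

```python
import numpy as np

# Illustrative: a (3, 1, 1) channels-first "image" with distinct channel values.
rgb = np.array([[[100.0]], [[150.0]], [[200.0]]])
gray = rgb[0] * 0.2989 + rgb[1] * 0.5870 + rgb[2] * 0.1140
# 100 * 0.2989 + 150 * 0.5870 + 200 * 0.1140 = 140.74
print(gray)  # [[140.74]]
```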
class SuperPointImageProcessor(BaseImageProcessor):
r"""
Constructs a SuperPoint image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
by `do_resize` in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 480, "width": 640}`):
Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to
`True`. Can be overridden by `size` in the `preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 480, "width": 640}
size = get_size_dict(size, default_to_square=False)
self.do_resize = do_resize
self.size = size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary of the form `{"height": int, "width": int}`, specifying the size of the output image.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the output image. If not provided, it will be inferred from the input
image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
size = get_size_dict(size, default_to_square=False)
return resize(
image,
size=(size["height"], size["width"]),
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images,
do_resize: bool = None,
size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the output image after `resize` has been applied, as a dictionary of the form
`{"height": int, "width": int}`. Only has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values to the `[0, 1]` range.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
images = make_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if is_scaled_image(images[0]) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if do_resize:
images = [self.resize(image=image, size=size, input_data_format=input_data_format) for image in images]
if do_rescale:
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
# Convert any non-grayscale image to grayscale, since SuperPoint operates on grayscale input
for i in range(len(images)):
if not is_grayscale(images[i], input_data_format):
images[i] = convert_to_grayscale(images[i], input_data_format=input_data_format)
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
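End-to-end usage of the processor defined above, as a minimal sketch (the image path is a placeholder):

```python
from PIL import Image
from transformers import SuperPointImageProcessor

processor = SuperPointImageProcessor()  # defaults: resize to 480x640, rescale by 1/255
image = Image.open("example.jpg")  # placeholder path
inputs = processor(images=image, return_tensors="pt")
# Grayscale values are replicated across 3 channels (see convert_to_grayscale above).
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 480, 640])
```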
@@ -8026,6 +8026,23 @@ class Starcoder2PreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class SuperPointModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SuperPointPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
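These dummy objects exist so that importing the names without the backend installed fails with an actionable message; a minimal sketch of the behavior, assuming torch is absent from the environment:

```python
# Illustrative: requires_backends raises an ImportError explaining that
# torch must be installed, instead of a late, obscure failure.
from transformers import SuperPointModel  # resolves to the dummy without torch

try:
    SuperPointModel()
except ImportError as err:
    print(err)  # installation instructions for the missing "torch" backend
```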
@@ -485,6 +485,13 @@ class SiglipImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class SuperPointImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class Swin2SRImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
from ...test_image_processing_common import (
ImageProcessingTestMixin,
prepare_image_inputs,
)
if is_vision_available():
from transformers import SuperPointImageProcessor
class SuperPointImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
):
size = size if size is not None else {"height": 480, "width": 640}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
}
def expected_output_image_shape(self, images):
return self.num_channels, self.size["height"], self.size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class SuperPointImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = SuperPointImageProcessor if is_vision_available() else None
def setUp(self) -> None:
self.image_processor_tester = SuperPointImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processing(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "rescale_factor"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 480, "width": 640})
image_processor = self.image_processing_class.from_dict(
self.image_processor_dict, size={"height": 42, "width": 42}
)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@unittest.skip(reason="SuperPointImageProcessor is always supposed to return a grayscaled image")
def test_call_numpy_4_channels(self):
pass
def test_input_image_properly_converted_to_grayscale(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs()
pre_processed_images = image_processor.preprocess(image_inputs)
for image in pre_processed_images["pixel_values"]:
self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]))
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import unittest
from typing import List
from transformers.models.superpoint.configuration_superpoint import SuperPointConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor
if is_torch_available():
import torch
from transformers import (
SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST,
SuperPointModel,
)
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class SuperPointModelTester:
def __init__(
self,
parent,
batch_size=3,
image_width=80,
image_height=60,
encoder_hidden_sizes: List[int] = [32, 32, 64, 64],
decoder_hidden_size: int = 128,
keypoint_decoder_dim: int = 65,
descriptor_decoder_dim: int = 128,
keypoint_threshold: float = 0.005,
max_keypoints: int = -1,
nms_radius: int = 4,
border_removal_distance: int = 4,
):
self.parent = parent
self.batch_size = batch_size
self.image_width = image_width
self.image_height = image_height
self.encoder_hidden_sizes = encoder_hidden_sizes
self.decoder_hidden_size = decoder_hidden_size
self.keypoint_decoder_dim = keypoint_decoder_dim
self.descriptor_decoder_dim = descriptor_decoder_dim
self.keypoint_threshold = keypoint_threshold
self.max_keypoints = max_keypoints
self.nms_radius = nms_radius
self.border_removal_distance = border_removal_distance
def prepare_config_and_inputs(self):
# SuperPoint expects a grayscale image as input
pixel_values = floats_tensor([self.batch_size, 3, self.image_height, self.image_width])
config = self.get_config()
return config, pixel_values
def get_config(self):
return SuperPointConfig(
encoder_hidden_sizes=self.encoder_hidden_sizes,
decoder_hidden_size=self.decoder_hidden_size,
keypoint_decoder_dim=self.keypoint_decoder_dim,
descriptor_decoder_dim=self.descriptor_decoder_dim,
keypoint_threshold=self.keypoint_threshold,
max_keypoints=self.max_keypoints,
nms_radius=self.nms_radius,
border_removal_distance=self.border_removal_distance,
)
def create_and_check_model(self, config, pixel_values):
model = SuperPointModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(
result.last_hidden_state.shape,
(
self.batch_size,
self.encoder_hidden_sizes[-1],
self.image_height // 8,
self.image_width // 8,
),
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class SuperPointModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (SuperPointModel,) if is_torch_available() else ()
all_generative_model_classes = ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
has_attentions = False
def setUp(self):
self.model_tester = SuperPointModelTester(self)
self.config_tester = ConfigTester(self, config_class=SuperPointConfig, has_text_modality=False, hidden_size=37)
def test_config(self):
self.create_and_test_config_common_properties()
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def create_and_test_config_common_properties(self):
return
@unittest.skip(reason="SuperPointModel does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="SuperPointModel does not support input and output embeddings")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="SuperPointModel does not use feedforward chunking")
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="SuperPointModel is not trainable")
def test_training(self):
pass
@unittest.skip(reason="SuperPointModel is not trainable")
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(reason="SuperPointModel is not trainable")
def test_training_gradient_checkpointing_use_reentrant(self):
pass
@unittest.skip(reason="SuperPointModel is not trainable")
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="SuperPoint does not output any loss term in the forward pass")
def test_retain_grad_hidden_states_attentions(self):
pass
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
# SuperPoint's feature maps are of shape (batch_size, num_channels, height, width)
for i, conv_layer_size in enumerate(self.model_tester.encoder_hidden_sizes[:-1]):
self.assertListEqual(
list(hidden_states[i].shape[-3:]),
[
conv_layer_size,
self.model_tester.image_height // (2 ** (i + 1)),
self.model_tester.image_width // (2 ** (i + 1)),
],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
@slow
def test_model_from_pretrained(self):
for model_name in SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = SuperPointModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_forward_labels_should_be_none(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
model_inputs = self._prepare_for_class(inputs_dict, model_class)
# Provide an arbitrary sized Tensor as labels to model inputs
model_inputs["labels"] = torch.rand((128, 128))
with self.assertRaises(ValueError) as cm:
model(**model_inputs)
self.assertEqual(ValueError, cm.exception.__class__)
def prepare_imgs():
image1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")
return [image1, image2]
@require_torch
@require_vision
class SuperPointModelIntegrationTest(unittest.TestCase):
@cached_property
def default_image_processor(self):
return AutoImageProcessor.from_pretrained("magic-leap-community/superpoint") if is_vision_available() else None
@slow
def test_inference(self):
model = SuperPointModel.from_pretrained("magic-leap-community/superpoint").to(torch_device)
preprocessor = self.default_image_processor
images = prepare_imgs()
inputs = preprocessor(images=images, return_tensors="pt").to(torch_device)
with torch.no_grad():
outputs = model(**inputs)
expected_number_keypoints_image0 = 567
expected_number_keypoints_image1 = 830
expected_max_number_keypoints = max(expected_number_keypoints_image0, expected_number_keypoints_image1)
expected_keypoints_shape = torch.Size((len(images), expected_max_number_keypoints, 2))
expected_scores_shape = torch.Size(
(
len(images),
expected_max_number_keypoints,
)
)
expected_descriptors_shape = torch.Size((len(images), expected_max_number_keypoints, 256))
# Check output shapes
self.assertEqual(outputs.keypoints.shape, expected_keypoints_shape)
self.assertEqual(outputs.scores.shape, expected_scores_shape)
self.assertEqual(outputs.descriptors.shape, expected_descriptors_shape)
expected_keypoints_image0_values = torch.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]]).to(torch_device)
expected_scores_image0_values = torch.tensor(
[0.0064, 0.0137, 0.0589, 0.0723, 0.5166, 0.0174, 0.1515, 0.2054, 0.0334]
).to(torch_device)
expected_descriptors_image0_value = torch.tensor(-0.1096).to(torch_device)
predicted_keypoints_image0_values = outputs.keypoints[0, :3]
predicted_scores_image0_values = outputs.scores[0, :9]
predicted_descriptors_image0_value = outputs.descriptors[0, 0, 0]
# Check output values
self.assertTrue(
torch.allclose(
predicted_keypoints_image0_values,
expected_keypoints_image0_values,
atol=1e-4,
)
)
self.assertTrue(torch.allclose(predicted_scores_image0_values, expected_scores_image0_values, atol=1e-4))
self.assertTrue(
torch.allclose(
predicted_descriptors_image0_value,
expected_descriptors_image0_value,
atol=1e-4,
)
)
# Check mask values
self.assertTrue(outputs.mask[0, expected_number_keypoints_image0 - 1].item() == 1)
self.assertTrue(outputs.mask[0, expected_number_keypoints_image0].item() == 0)
self.assertTrue(torch.all(outputs.mask[0, : expected_number_keypoints_image0 - 1]))
self.assertTrue(torch.all(torch.logical_not(outputs.mask[0, expected_number_keypoints_image0:])))
self.assertTrue(torch.all(outputs.mask[1]))
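Because the batched outputs are zero-padded up to the largest keypoint count, downstream code is expected to slice with the mask; a minimal sketch consistent with the assertions above:

```python
# Illustrative: recover the variable-length detections for each image.
for i in range(outputs.keypoints.shape[0]):
    num_valid = int(outputs.mask[i].sum().item())
    keypoints = outputs.keypoints[i, :num_valid]      # (num_valid, 2)
    scores = outputs.scores[i, :num_valid]            # (num_valid,)
    descriptors = outputs.descriptors[i, :num_valid]  # (num_valid, 256)
```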