Unverified Commit 2422c38d authored by NielsRogge, committed by GitHub

Add DINOv2 depth estimation (#26092)



* First draft

* Fix style

* More improvements

* Fix tests

* Fix tests

* Convert checkpoint

* Improve DPTImageProcessor

* Remove scripts, improve conversion script

* Remove print statements

* Fix test

* Improve docstring

* More improvements

* Fix style

* Fix image processor

* Add tests

* Address comments

* Address comments

* Make bias backwards compatible

* Address comment

* Address comment

* Address comment

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Address comments

* Add flag

* Add tests

* Make tests smaller

* Use regular BackboneOutput

* Fix all tests

* Update test

* Convert more checkpoints

* Convert giant checkpoints, add integration test

* Rename size_divisibility to size_divisor

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 3b596213
@@ -835,11 +835,12 @@ class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
             if self.config.apply_layernorm:
                 hidden_state = self.layernorm(hidden_state)
             if self.config.reshape_hidden_states:
+                hidden_state = hidden_state[:, 1:]
+                # this was actually a bug in the original implementation that we copied here,
+                # cause normally the order is height, width
                 batch_size, _, height, width = pixel_values.shape
                 patch_size = self.config.patch_size
-                hidden_state = hidden_state[:, 1:, :].reshape(
-                    batch_size, width // patch_size, height // patch_size, -1
-                )
+                hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
                 hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
             feature_maps += (hidden_state,)
......
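For illustration (not part of the diff), a minimal sketch of the corrected reshape order, assuming a hypothetical 224x224 input and a patch size of 14:

import torch

batch_size, hidden_size = 2, 384
height = width = 224
patch_size = 14
num_patches = (height // patch_size) * (width // patch_size)  # 16 * 16 = 256

# backbone output: a CLS token followed by the patch tokens
hidden_state = torch.randn(batch_size, 1 + num_patches, hidden_size)

hidden_state = hidden_state[:, 1:]  # drop the CLS token
# reshape in (height, width) order, as in the fix above
hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
print(hidden_state.shape)  # torch.Size([2, 384, 16, 16])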
@@ -18,6 +18,7 @@ import copy

 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
+from ..auto.configuration_auto import CONFIG_MAPPING
 from ..bit import BitConfig

@@ -91,6 +92,10 @@ class DPTConfig(PretrainedConfig):
             The index of the features to use in the heads.
         use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
             Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
+        use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the pre-activate residual units of the fusion blocks.
+        add_projection (`bool`, *optional*, defaults to `False`):
+            Whether to add a projection layer before the depth estimation head.
         use_auxiliary_head (`bool`, *optional*, defaults to `True`):
             Whether to use an auxiliary head during training.
         auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):

@@ -104,7 +109,8 @@ class DPTConfig(PretrainedConfig):
         neck_ignore_stages (`List[int]`, *optional*, defaults to `[0, 1]`):
             Used only for the `hybrid` embedding type. The stages of the readout layers to ignore.
         backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
-            Used only for the `hybrid` embedding type. The configuration of the backbone in a dictionary.
+            The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
+            leverage the [`AutoBackbone`] API.

     Example:

@@ -145,6 +151,8 @@ class DPTConfig(PretrainedConfig):
         fusion_hidden_size=256,
         head_in_index=-1,
         use_batch_norm_in_fusion_residual=False,
+        use_bias_in_fusion_residual=None,
+        add_projection=False,
         use_auxiliary_head=True,
         auxiliary_loss_weight=0.4,
         semantic_loss_ignore_index=255,

@@ -159,6 +167,7 @@ class DPTConfig(PretrainedConfig):
         self.hidden_size = hidden_size
         self.is_hybrid = is_hybrid

+        use_autobackbone = False
         if self.is_hybrid:
             if backbone_config is None:
                 logger.info("Initializing the config with a `BiT` backbone.")

@@ -185,32 +194,49 @@ class DPTConfig(PretrainedConfig):
             if readout_type != "project":
                 raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")
+        elif backbone_config is not None:
+            use_autobackbone = True
+            if isinstance(backbone_config, dict):
+                backbone_model_type = backbone_config.get("model_type")
+                config_class = CONFIG_MAPPING[backbone_model_type]
+                backbone_config = config_class.from_dict(backbone_config)
+            self.backbone_config = backbone_config
+            self.backbone_featmap_shape = None
+            self.neck_ignore_stages = []
         else:
-            self.backbone_config = None
+            self.backbone_config = backbone_config
             self.backbone_featmap_shape = None
             self.neck_ignore_stages = []

-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.qkv_bias = qkv_bias
-        self.backbone_out_indices = backbone_out_indices
+        self.num_hidden_layers = None if use_autobackbone else num_hidden_layers
+        self.num_attention_heads = None if use_autobackbone else num_attention_heads
+        self.intermediate_size = None if use_autobackbone else intermediate_size
+        self.hidden_dropout_prob = None if use_autobackbone else hidden_dropout_prob
+        self.attention_probs_dropout_prob = None if use_autobackbone else attention_probs_dropout_prob
+        self.layer_norm_eps = None if use_autobackbone else layer_norm_eps
+        self.image_size = None if use_autobackbone else image_size
+        self.patch_size = None if use_autobackbone else patch_size
+        self.num_channels = None if use_autobackbone else num_channels
+        self.qkv_bias = None if use_autobackbone else qkv_bias
+        self.backbone_out_indices = None if use_autobackbone else backbone_out_indices
+
         if readout_type not in ["ignore", "add", "project"]:
             raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
         self.readout_type = readout_type
         self.reassemble_factors = reassemble_factors
         self.neck_hidden_sizes = neck_hidden_sizes
         self.fusion_hidden_size = fusion_hidden_size
         self.head_in_index = head_in_index
         self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
+        self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
+        self.add_projection = add_projection

         # auxiliary head attributes (semantic segmentation)
         self.use_auxiliary_head = use_auxiliary_head
         self.auxiliary_loss_weight = auxiliary_loss_weight
......
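For reference, a minimal sketch of configuring DPT with a DINOv2 backbone through `backbone_config` (the values mirror the "base" branch of the conversion script below; a default Dinov2Config roughly corresponds to dinov2-base):

from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation

# DINOv2 backbone returning raw (non-normalized, non-reshaped) hidden states for the DPT neck
backbone_config = Dinov2Config(
    out_indices=[3, 6, 9, 12],
    apply_layernorm=False,
    reshape_hidden_states=False,
)

config = DPTConfig(
    backbone_config=backbone_config,
    neck_hidden_sizes=[96, 192, 384, 768],
    use_bias_in_fusion_residual=False,
    add_projection=True,
)
model = DPTForDepthEstimation(config)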
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DINOv2 + DPT checkpoints from the original repository. URL:
https://github.com/facebookresearch/dinov2/tree/main"""
import argparse
import itertools
import math
from pathlib import Path
import requests
import torch
from PIL import Image
from torchvision import transforms
from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_dpt_config(model_name):
if "small" in model_name:
# equivalent to stage 3, stage 6, stage 9, stage 12
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False
)
neck_hidden_sizes = [48, 96, 192, 384]
elif "base" in model_name:
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False
)
neck_hidden_sizes = [96, 192, 384, 768]
elif "large" in model_name:
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False
)
neck_hidden_sizes = [128, 256, 512, 1024]
elif "giant" in model_name:
backbone_config = Dinov2Config.from_pretrained(
"facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False
)
neck_hidden_sizes = [192, 384, 768, 1536]
else:
raise NotImplementedError("To do")
config = DPTConfig(
backbone_config=backbone_config,
neck_hidden_sizes=neck_hidden_sizes,
use_bias_in_fusion_residual=False,
add_projection=True,
)
return config
# here we list all DPT keys to be renamed (original name on the left, our name on the right)
def create_rename_keys_dpt(config):
rename_keys = []
# fmt: off
# activation postprocessing (projections, readout projections + resize blocks)
for i in range(4):
rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight"))
rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias"))
rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight"))
rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias"))
if i != 2:
rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight"))
rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias"))
# fusion layers
for i in range(4):
rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight"))
rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", f"neck.fusion_stage.layers.{i}.projection.bias"))
if i != 0:
rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight"))
rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight"))
rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight"))
rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight"))
# neck convolutions
for i in range(4):
rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight"))
# head
rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight"))
rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias"))
for i in range(0, 5, 2):
rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight"))
rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias"))
# fmt: on
return rename_keys
# here we list all backbone keys to be renamed (original name on the left, our name on the right)
def create_rename_keys_backbone(config):
rename_keys = []
# fmt: off
# patch embedding layer
rename_keys.append(("cls_token", "backbone.embeddings.cls_token"))
rename_keys.append(("mask_token", "backbone.embeddings.mask_token"))
rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings"))
rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
# Transformer encoder
for i in range(config.backbone_config.num_hidden_layers):
# layernorms
rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias"))
# MLP
if config.backbone_config.use_swiglu_ffn:
rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight"))
rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias"))
rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight"))
rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias"))
else:
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias"))
# layerscale
rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1"))
rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1"))
# attention projection layer
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias"))
# fmt: on
rename_keys.append(("norm.weight", "backbone.layernorm.weight"))
rename_keys.append(("norm.bias", "backbone.layernorm.bias"))
return rename_keys
# we split up the matrix of each encoder layer into queries, keys and values
def read_in_q_k_v(state_dict, config):
for i in range(config.backbone_config.num_hidden_layers):
# read in weights + bias of input projection layer (in timm, this is a single matrix + bias)
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
hidden_size = config.backbone_config.hidden_size
# next, add query, keys and values (in that order) to the state dict
state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
hidden_size : hidden_size * 2
]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:]
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
# We will verify our results on an image of cute cats
def prepare_img():
url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
name_to_url = {
"dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth",
"dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth",
"dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth",
"dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth",
"dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth",
"dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth",
"dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth",
"dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth",
}
def get_original_pixel_values(image):
class CenterPadding(object):
def __init__(self, multiple):
super().__init__()
self.multiple = multiple
def _get_pad(self, size):
new_size = math.ceil(size / self.multiple) * self.multiple
pad_size = new_size - size
pad_size_left = pad_size // 2
pad_size_right = pad_size - pad_size_left
return pad_size_left, pad_size_right
def __call__(self, img):
pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1]))
output = torch.nn.functional.pad(img, pads)
return output
def __repr__(self):
return self.__class__.__name__ + "()"
def make_depth_transform() -> transforms.Compose:
return transforms.Compose(
[
transforms.ToTensor(),
lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255
transforms.Normalize(
mean=(123.675, 116.28, 103.53),
std=(58.395, 57.12, 57.375),
),
CenterPadding(multiple=14),
]
)
transform = make_depth_transform()
original_pixel_values = transform(image).unsqueeze(0)
return original_pixel_values
@torch.no_grad()
def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits):
"""
Copy/paste/tweak model's weights to our DPT structure.
"""
# define DPT configuration based on URL
checkpoint_url = name_to_url[model_name]
config = get_dpt_config(model_name)
# load original DPT state_dict from URL
print("URL:", checkpoint_url)
dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]
# rename keys
rename_keys = create_rename_keys_dpt(config)
for src, dest in rename_keys:
rename_key(dpt_state_dict, src, dest)
# load original backbone state_dict from URL
if "small" in model_name:
original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
elif "base" in model_name:
original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")
elif "large" in model_name:
original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14")
elif "giant" in model_name:
original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14")
else:
raise NotImplementedError("To do")
original_model.eval()
backbone_state_dict = original_model.state_dict()
# rename keys
rename_keys = create_rename_keys_backbone(config)
for src, dest in rename_keys:
rename_key(backbone_state_dict, src, dest)
# read in qkv matrices
read_in_q_k_v(backbone_state_dict, config)
for key, val in backbone_state_dict.copy().items():
val = backbone_state_dict.pop(key)
if "w12" in key:
key = key.replace("w12", "weights_in")
if "w3" in key:
key = key.replace("w3", "weights_out")
backbone_state_dict[key] = val
# merge state_dicts
state_dict = {**backbone_state_dict, **dpt_state_dict}
# load HuggingFace model
model = DPTForDepthEstimation(config)
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
assert missing_keys == [
"neck.fusion_stage.layers.0.residual_layer1.convolution1.weight",
"neck.fusion_stage.layers.0.residual_layer1.convolution2.weight",
]
model.eval()
# Verify image processor
processor = DPTImageProcessor(
do_resize=False,
do_rescale=False,
do_pad=True,
size_divisor=14,
do_normalize=True,
image_mean=(123.675, 116.28, 103.53),
image_std=(58.395, 57.12, 57.375),
)
image = prepare_img()
pixel_values = processor(image, return_tensors="pt").pixel_values.float()
original_pixel_values = get_original_pixel_values(image)
assert torch.allclose(pixel_values, original_pixel_values)
# Verify forward pass
with torch.no_grad():
outputs = model(pixel_values)
predicted_depth = outputs.predicted_depth
print("Shape of predicted depth:", predicted_depth.shape)
print("First values of predicted depth:", predicted_depth[0, :3, :3])
# assert logits
if verify_logits:
if model_name == "dpt-dinov2-small-nyu":
expected_shape = torch.Size([1, 576, 736])
expected_slice = torch.tensor(
[[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]]
)
assert predicted_depth.shape == torch.Size(expected_shape)
assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(repo_id=f"facebook/{model_name}")
processor.push_to_hub(repo_id=f"facebook/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_name",
default="dpt-dinov2-small-nyu",
type=str,
choices=name_to_url.keys(),
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model to the hub after conversion.",
)
parser.add_argument(
"--verify_logits",
action="store_true",
required=False,
help="Path to the output PyTorch model directory.",
)
args = parser.parse_args()
convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits)
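For reference, a minimal usage sketch of a converted checkpoint (the repo id is taken from the integration test at the bottom of this commit; hub availability is assumed):

import requests
import torch
from PIL import Image

from transformers import DPTForDepthEstimation, DPTImageProcessor

url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DPTImageProcessor.from_pretrained("facebook/dpt-dinov2-small-kitti")
model = DPTForDepthEstimation.from_pretrained("facebook/dpt-dinov2-small-kitti")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth  # shape (batch_size, height, width)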
@@ -229,12 +229,14 @@ def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub
         if "ade" in checkpoint_url
         else torch.allclose(outputs[0, :3, :3], expected_slice)
     )
+    print("Looks ok!")

-    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
-    print(f"Saving model to {pytorch_dump_folder_path}")
-    model.save_pretrained(pytorch_dump_folder_path)
-    print(f"Saving image processor to {pytorch_dump_folder_path}")
-    image_processor.save_pretrained(pytorch_dump_folder_path)
+    if pytorch_dump_folder_path is not None:
+        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+        print(f"Saving model to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+        print(f"Saving image processor to {pytorch_dump_folder_path}")
+        image_processor.save_pretrained(pytorch_dump_folder_path)

     if push_to_hub:
         print("Pushing model to hub...")

@@ -265,7 +267,7 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path",
         default=None,
         type=str,
-        required=True,
+        required=False,
         help="Path to the output PyTorch model directory.",
     )
     parser.add_argument(

@@ -276,6 +278,7 @@ if __name__ == "__main__":
         "--model_name",
         default="dpt-large",
         type=str,
+        required=False,
         help="Name of the model, in case you're pushing to the hub.",
     )
......
@@ -20,7 +20,7 @@ from typing import Dict, Iterable, List, Optional, Tuple, Union
 import numpy as np

 from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
-from ...image_transforms import resize, to_channel_dimension_format
+from ...image_transforms import pad, resize, to_channel_dimension_format
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,

@@ -122,6 +122,12 @@ class DPTImageProcessor(BaseImageProcessor):
         image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `False`):
+            Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
+            combination with DPT.
+        size_divisor (`int`, *optional*):
+            If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
+            DINOv2 paper, which uses the model in combination with DPT.
     """

     model_input_names = ["pixel_values"]

@@ -138,6 +144,8 @@ class DPTImageProcessor(BaseImageProcessor):
         do_normalize: bool = True,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
+        do_pad: bool = False,
+        size_divisor: int = None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)

@@ -153,6 +161,8 @@ class DPTImageProcessor(BaseImageProcessor):
         self.do_normalize = do_normalize
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.do_pad = do_pad
+        self.size_divisor = size_divisor

     def resize(
         self,

@@ -208,6 +218,51 @@ class DPTImageProcessor(BaseImageProcessor):
             **kwargs,
         )

+    def pad_image(
+        self,
+        image: np.array,
+        size_divisor: int,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Center pad an image to be a multiple of `multiple`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to pad.
+            size_divisor (`int`):
+                The width and height of the image will be padded to a multiple of this number.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+
+        def _get_pad(size, size_divisor):
+            new_size = math.ceil(size / size_divisor) * size_divisor
+            pad_size = new_size - size
+            pad_size_left = pad_size // 2
+            pad_size_right = pad_size - pad_size_left
+            return pad_size_left, pad_size_right
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+        pad_size_left, pad_size_right = _get_pad(height, size_divisor)
+        pad_size_top, pad_size_bottom = _get_pad(width, size_divisor)
+
+        return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format)
+
     def preprocess(
         self,
         images: ImageInput,

@@ -221,6 +276,8 @@ class DPTImageProcessor(BaseImageProcessor):
         do_normalize: bool = None,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
+        do_pad: bool = None,
+        size_divisor: int = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: ChannelDimension = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,

@@ -286,6 +343,8 @@ class DPTImageProcessor(BaseImageProcessor):
         do_normalize = do_normalize if do_normalize is not None else self.do_normalize
         image_mean = image_mean if image_mean is not None else self.image_mean
         image_std = image_std if image_std is not None else self.image_std
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        size_divisor = size_divisor if size_divisor is not None else self.size_divisor

         images = make_list_of_images(images)

@@ -304,6 +363,9 @@ class DPTImageProcessor(BaseImageProcessor):
         if do_normalize and (image_mean is None or image_std is None):
             raise ValueError("Image mean and std must be specified if do_normalize is True.")

+        if do_pad and size_divisor is None:
+            raise ValueError("Size divisibility must be specified if do_pad is True.")
+
         # All transformations expect numpy arrays.
         images = [to_numpy_array(image) for image in images]

@@ -335,6 +397,12 @@ class DPTImageProcessor(BaseImageProcessor):
                 for image in images
             ]

+        if do_pad:
+            images = [
+                self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format)
+                for image in images
+            ]
+
         images = [
             to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
         ]
......
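As a quick illustration of the new center padding (a sketch, not part of the diff): a 249-pixel dimension with size_divisor=14 is padded to ceil(249 / 14) * 14 = 252, split as 1 pixel before and 2 pixels after; a 491-pixel dimension becomes 504.

import numpy as np
from transformers import DPTImageProcessor

processor = DPTImageProcessor(do_resize=False, do_rescale=False, do_pad=True, size_divisor=14)

image = np.random.randn(3, 249, 491).astype(np.float32)
pixel_values = processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 252, 504])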
@@ -599,12 +599,13 @@ class DPTReassembleStage(nn.Module):
         # When using DPT-Hybrid the readout type is set to "project". The sanity check is done on the config file
         self.readout_projects = nn.ModuleList()
+        hidden_size = _get_backbone_hidden_size(config)
         for i in range(len(config.neck_hidden_sizes)):
             if i <= 1:
                 self.readout_projects.append(nn.Sequential(nn.Identity()))
             elif i > 1:
                 self.readout_projects.append(
-                    nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act])
+                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                 )

     def _init_reassemble_dpt(self, config):

@@ -613,12 +614,13 @@ class DPTReassembleStage(nn.Module):
         if config.readout_type == "project":
             self.readout_projects = nn.ModuleList()
+            hidden_size = _get_backbone_hidden_size(config)
             for _ in range(len(config.neck_hidden_sizes)):
                 self.readout_projects.append(
-                    nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act])
+                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                 )

-    def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:
+    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
         """
         Args:
             hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):

@@ -628,21 +630,24 @@ class DPTReassembleStage(nn.Module):
         for i, hidden_state in enumerate(hidden_states):
             if i not in self.neck_ignore_stages:
-                # reshape to (B, C, H, W)
-                hidden_state, cls_token = hidden_state[:, 1:], hidden_state[:, 0]
+                # reshape to (batch_size, num_channels, height, width)
+                cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:]
                 batch_size, sequence_length, num_channels = hidden_state.shape
-                size = int(math.sqrt(sequence_length))
-                hidden_state = hidden_state.reshape(batch_size, size, size, num_channels)
+                if patch_height is not None and patch_width is not None:
+                    hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
+                else:
+                    size = int(math.sqrt(sequence_length))
+                    hidden_state = hidden_state.reshape(batch_size, size, size, num_channels)
                 hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                 feature_shape = hidden_state.shape
                 if self.config.readout_type == "project":
-                    # reshape to (B, H*W, C)
+                    # reshape to (batch_size, height*width, num_channels)
                     hidden_state = hidden_state.flatten(2).permute((0, 2, 1))
                     readout = cls_token.unsqueeze(1).expand_as(hidden_state)
                     # concatenate the readout token to the hidden states and project
                     hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1))
-                    # reshape back to (B, C, H, W)
+                    # reshape back to (batch_size, num_channels, height, width)
                     hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape)
                 elif self.config.readout_type == "add":
                     hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1)

@@ -653,11 +658,19 @@ class DPTReassembleStage(nn.Module):
         return out


+def _get_backbone_hidden_size(config):
+    if config.backbone_config is not None and config.is_hybrid is False:
+        return config.backbone_config.hidden_size
+    else:
+        return config.hidden_size
+
+
 class DPTReassembleLayer(nn.Module):
     def __init__(self, config, channels, factor):
         super().__init__()
         # projection
-        self.projection = nn.Conv2d(in_channels=config.hidden_size, out_channels=channels, kernel_size=1)
+        hidden_size = _get_backbone_hidden_size(config)
+        self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1)

         # up/down sampling depending on factor
         if factor > 1:

@@ -710,24 +723,30 @@ class DPTPreActResidualLayer(nn.Module):
         super().__init__()

         self.use_batch_norm = config.use_batch_norm_in_fusion_residual
-        self.activation1 = ACT2FN["relu"]
+        use_bias_in_fusion_residual = (
+            config.use_bias_in_fusion_residual
+            if config.use_bias_in_fusion_residual is not None
+            else not self.use_batch_norm
+        )
+
+        self.activation1 = nn.ReLU()
         self.convolution1 = nn.Conv2d(
             config.fusion_hidden_size,
             config.fusion_hidden_size,
             kernel_size=3,
             stride=1,
             padding=1,
-            bias=not self.use_batch_norm,
+            bias=use_bias_in_fusion_residual,
         )

-        self.activation2 = ACT2FN["relu"]
+        self.activation2 = nn.ReLU()
         self.convolution2 = nn.Conv2d(
             config.fusion_hidden_size,
             config.fusion_hidden_size,
             kernel_size=3,
             stride=1,
             padding=1,
-            bias=not self.use_batch_norm,
+            bias=use_bias_in_fusion_residual,
         )

         if self.use_batch_norm:

@@ -973,8 +992,12 @@ class DPTNeck(nn.Module):
         super().__init__()
         self.config = config

-        # postprocessing
-        self.reassemble_stage = DPTReassembleStage(config)
+        # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT)
+        if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
+            self.reassemble_stage = None
+        else:
+            self.reassemble_stage = DPTReassembleStage(config)
+
         self.convs = nn.ModuleList()
         for channel in config.neck_hidden_sizes:
             self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))

@@ -982,17 +1005,23 @@ class DPTNeck(nn.Module):
         # fusion
         self.fusion_stage = DPTFeatureFusionStage(config)

-    def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:
-        if not isinstance(hidden_states, list):
-            raise ValueError("hidden_states should be a list of tensors")
+    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
+        """
+        Args:
+            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
+                List of hidden states from the backbone.
+        """
+        if not isinstance(hidden_states, (tuple, list)):
+            raise ValueError("hidden_states should be a tuple or list of tensors")

         if len(hidden_states) != len(self.config.neck_hidden_sizes):
             raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

         # postprocess hidden states
-        features = self.reassemble_stage(hidden_states)
+        if self.reassemble_stage is not None:
+            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

-        features = [self.convs[i](feature) for i, feature in enumerate(features)]
+        features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]

         # fusion blocks
         output = self.fusion_stage(features)

@@ -1012,20 +1041,28 @@ class DPTDepthEstimationHead(nn.Module):
         self.config = config

+        self.projection = None
+        if config.add_projection:
+            self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+
         features = config.fusion_hidden_size
         self.head = nn.Sequential(
             nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
             nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
             nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
-            ACT2FN["relu"],
+            nn.ReLU(),
             nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
-            ACT2FN["relu"],
+            nn.ReLU(),
         )

     def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
         # use last features
         hidden_states = hidden_states[self.config.head_in_index]

+        if self.projection is not None:
+            hidden_states = self.projection(hidden_states)
+            hidden_states = nn.ReLU()(hidden_states)
+
         predicted_depth = self.head(hidden_states)

         predicted_depth = predicted_depth.squeeze(dim=1)

@@ -1043,7 +1080,11 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)

-        self.dpt = DPTModel(config, add_pooling_layer=False)
+        self.backbone = None
+        if config.backbone_config is not None and config.is_hybrid is False:
+            self.backbone = AutoBackbone.from_config(config.backbone_config)
+        else:
+            self.dpt = DPTModel(config, add_pooling_layer=False)

         # Neck
         self.neck = DPTNeck(config)

@@ -1109,32 +1150,46 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

-        outputs = self.dpt(
-            pixel_values,
-            head_mask=head_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=True,  # we need the intermediate hidden states
-            return_dict=return_dict,
-        )
-        hidden_states = outputs.hidden_states if return_dict else outputs[1]
-        # only keep certain features based on config.backbone_out_indices
-        # note that the hidden_states also include the initial embeddings
-        if not self.config.is_hybrid:
-            hidden_states = [
-                feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
-            ]
-        else:
-            backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1])
-            backbone_hidden_states.extend(
-                feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:]
-            )
-            hidden_states = backbone_hidden_states
-
-        hidden_states = self.neck(hidden_states)
+        if self.backbone is not None:
+            outputs = self.backbone.forward_with_filtered_kwargs(
+                pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
+            )
+            hidden_states = outputs.feature_maps
+        else:
+            outputs = self.dpt(
+                pixel_values,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=True,  # we need the intermediate hidden states
+                return_dict=return_dict,
+            )
+            hidden_states = outputs.hidden_states if return_dict else outputs[1]
+            # only keep certain features based on config.backbone_out_indices
+            # note that the hidden_states also include the initial embeddings
+            if not self.config.is_hybrid:
+                hidden_states = [
+                    feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
+                ]
+            else:
+                backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1])
+                backbone_hidden_states.extend(
+                    feature
+                    for idx, feature in enumerate(hidden_states[1:])
+                    if idx in self.config.backbone_out_indices[2:]
+                )
+                hidden_states = backbone_hidden_states
+
+        patch_height, patch_width = None, None
+        if self.config.backbone_config is not None and self.config.is_hybrid is False:
+            _, _, height, width = pixel_values.shape
+            patch_size = self.config.backbone_config.patch_size
+            patch_height = height // patch_size
+            patch_width = width // patch_size
+
+        hidden_states = self.neck(hidden_states, patch_height, patch_width)

         predicted_depth = self.head(hidden_states)

@@ -1167,7 +1222,7 @@ class DPTSemanticSegmentationHead(nn.Module):
         self.head = nn.Sequential(
             nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
             nn.BatchNorm2d(features),
-            ACT2FN["relu"],
+            nn.ReLU(),
             nn.Dropout(config.semantic_classifier_dropout),
             nn.Conv2d(features, config.num_labels, kernel_size=1),
             nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),

@@ -1190,7 +1245,7 @@ class DPTAuxiliaryHead(nn.Module):
         self.head = nn.Sequential(
             nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
             nn.BatchNorm2d(features),
-            ACT2FN["relu"],
+            nn.ReLU(),
             nn.Dropout(0.1, False),
             nn.Conv2d(features, config.num_labels, kernel_size=1),
         )

@@ -1287,7 +1342,7 @@ class DPTForSemanticSegmentation(DPTPreTrainedModel):
             hidden_states = backbone_hidden_states

-        hidden_states = self.neck(hidden_states)
+        hidden_states = self.neck(hidden_states=hidden_states)

         logits = self.head(hidden_states)
......
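A small worked example of the patch grid computation added in DPTForDepthEstimation.forward above (values are illustrative: a 518x518 padded input with the DINOv2 patch size of 14):

height, width, patch_size = 518, 518, 14
patch_height = height // patch_size  # 37
patch_width = width // patch_size    # 37
num_patch_tokens = patch_height * patch_width  # 1369 patch tokens (plus one CLS token)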
@@ -16,6 +16,8 @@
 import unittest

+import numpy as np
+
 from transformers.file_utils import is_vision_available
 from transformers.testing_utils import require_torch, require_vision

@@ -97,6 +99,10 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         self.assertTrue(hasattr(image_processing, "do_normalize"))
         self.assertTrue(hasattr(image_processing, "do_resize"))
         self.assertTrue(hasattr(image_processing, "size"))
+        self.assertTrue(hasattr(image_processing, "do_rescale"))
+        self.assertTrue(hasattr(image_processing, "rescale_factor"))
+        self.assertTrue(hasattr(image_processing, "do_pad"))
+        self.assertTrue(hasattr(image_processing, "size_divisor"))

     def test_image_processor_from_dict_with_kwargs(self):
         image_processor = self.image_processing_class.from_dict(self.image_processor_dict)

@@ -104,3 +110,19 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
         self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+
+    def test_padding(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        image = np.random.randn(3, 249, 491)
+
+        # test individual method
+        image = image_processing.pad_image(image, size_divisor=4)
+        self.assertTrue(image.shape[1] % 4 == 0)
+        self.assertTrue(image.shape[2] % 4 == 0)
+
+        # test by calling
+        pixel_values = image_processing.preprocess(
+            image, do_rescale=False, do_resize=False, do_pad=True, size_divisor=4, return_tensors="pt"
+        ).pixel_values
+        self.assertTrue(pixel_values.shape[2] % 4 == 0)
+        self.assertTrue(pixel_values.shape[3] % 4 == 0)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch DPT model. """
import inspect
import unittest
from transformers import Dinov2Config, DPTConfig
from transformers.file_utils import is_torch_available, is_vision_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import MODEL_MAPPING, DPTForDepthEstimation
from transformers.models.dpt.modeling_dpt import DPT_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import DPTImageProcessor
class DPTModelTester:
def __init__(
self,
parent,
batch_size=2,
num_channels=3,
image_size=32,
patch_size=16,
use_labels=True,
num_labels=3,
is_training=True,
hidden_size=4,
num_hidden_layers=2,
num_attention_heads=2,
intermediate_size=8,
out_features=["stage1", "stage2"],
apply_layernorm=False,
reshape_hidden_states=False,
neck_hidden_sizes=[2, 2],
fusion_hidden_size=6,
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.patch_size = patch_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.out_features = out_features
self.apply_layernorm = apply_layernorm
self.reshape_hidden_states = reshape_hidden_states
self.use_labels = use_labels
self.num_labels = num_labels
self.is_training = is_training
self.neck_hidden_sizes = neck_hidden_sizes
self.fusion_hidden_size = fusion_hidden_size
# DPT's sequence length
self.seq_length = (self.image_size // self.patch_size) ** 2 + 1
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return DPTConfig(
backbone_config=self.get_backbone_config(),
neck_hidden_sizes=self.neck_hidden_sizes,
fusion_hidden_size=self.fusion_hidden_size,
)
def get_backbone_config(self):
return Dinov2Config(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
is_training=self.is_training,
out_features=self.out_features,
reshape_hidden_states=self.reshape_hidden_states,
)
def create_and_check_for_depth_estimation(self, config, pixel_values, labels):
config.num_labels = self.num_labels
model = DPTForDepthEstimation(config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (DPTForDepthEstimation,) if is_torch_available() else ()
pipeline_model_mapping = {"depth-estimation": DPTForDepthEstimation} if is_torch_available() else {}
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = DPTModelTester(self)
self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings")
def test_inputs_embeds(self):
pass
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_for_depth_estimation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs)
def test_training(self):
for model_class in self.all_model_classes:
if model_class.__name__ == "DPTForDepthEstimation":
continue
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
if model_class in get_values(MODEL_MAPPING):
continue
model = model_class(config)
model.to(torch_device)
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
def test_training_gradient_checkpointing(self):
for model_class in self.all_model_classes:
if model_class.__name__ == "DPTForDepthEstimation":
continue
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.use_cache = False
config.return_dict = True
if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing:
continue
model = model_class(config)
model.to(torch_device)
model.gradient_checkpointing_enable()
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
# Skip the check for the backbone
backbone_params = []
for name, module in model.named_modules():
if module.__class__.__name__ == "DPTViTHybridEmbeddings":
backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
break
for name, param in model.named_parameters():
if param.requires_grad:
if name in backbone_params:
continue
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="DPT with AutoBackbone does not have a base model")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="DPT with AutoBackbone does not have a base model")
def test_save_load_fast_init_to_base(self):
pass
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@slow
def test_model_from_pretrained(self):
for model_name in DPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = DPTForDepthEstimation.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
@slow
class DPTModelIntegrationTest(unittest.TestCase):
def test_inference_depth_estimation(self):
image_processor = DPTImageProcessor.from_pretrained("facebook/dpt-dinov2-small-kitti")
model = DPTForDepthEstimation.from_pretrained("facebook/dpt-dinov2-small-kitti").to(torch_device)
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
predicted_depth = outputs.predicted_depth
# verify the predicted depth
expected_shape = torch.Size((1, 576, 736))
self.assertEqual(predicted_depth.shape, expected_shape)
expected_slice = torch.tensor(
[[6.0433, 7.1636, 7.4268], [6.9047, 7.2471, 7.2355], [7.9261, 8.0631, 8.0244]]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4))