"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "3104036e7f1a3cd6e07a69d648c3597de32f72fe"
Unverified commit bdf36dcd authored by amyeroberts, committed by GitHub

Enable HF pretrained backbones (#31145)

* Enable loading HF or timm backbone checkpoints

* Fix up

* Fix test - pass in proper out_indices

* Update docs

* Fix tvp tests

* Fix doc examples

* Fix doc examples

* Try to resolve DPT backbone param init

* Don't conditionally set to None

* Add condition based on whether backbone is defined

* Address review comments
parent a3d351c0
@@ -327,31 +327,21 @@ For example, to load a [ResNet](../model_doc/resnet) backbone into a [MaskFormer
 Set `use_pretrained_backbone=True` to load pretrained ResNet weights for the backbone.
 
 ```py
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
 
-config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=True) # backbone and neck config
+config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) # backbone and neck config
 model = MaskFormerForInstanceSegmentation(config) # head
 ```
 
-You could also load the backbone config separately and then pass it to the model config.
-
-```py
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
-
-backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50")
-config = MaskFormerConfig(backbone_config=backbone_config)
-model = MaskFormerForInstanceSegmentation(config)
-```
-
 </hfoption>
 <hfoption id="random weights">
 
 Set `use_pretrained_backbone=False` to randomly initialize a ResNet backbone.
 
 ```py
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
 
-config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=False) # backbone and neck config
+config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) # backbone and neck config
 model = MaskFormerForInstanceSegmentation(config) # head
 ```
@@ -366,15 +356,43 @@ model = MaskFormerForInstanceSegmentation(config)
 ```
 
 </hfoption>
-<hfoption id="timm backbone">
-
-[timm](https://hf.co/docs/timm/index) models are loaded with [`TimmBackbone`] and [`TimmBackboneConfig`].
-
-```python
-from transformers import TimmBackboneConfig, TimmBackbone
-
-backbone_config = TimmBackboneConfig("resnet50")
-model = TimmBackbone(config=backbone_config)
-```
-
-</hfoption>
 </hfoptions>
+
+[timm](https://hf.co/docs/timm/index) models are loaded within a model with `use_timm_backbone=True` or with [`TimmBackbone`] and [`TimmBackboneConfig`].
+
+Use `use_timm_backbone=True` and `use_pretrained_backbone=True` to load pretrained timm weights for the backbone.
+
+```python
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) # backbone and neck config
+model = MaskFormerForInstanceSegmentation(config) # head
+```
+
+Set `use_timm_backbone=True` and `use_pretrained_backbone=False` to load a randomly initialized timm backbone.
+
+```python
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) # backbone and neck config
+model = MaskFormerForInstanceSegmentation(config) # head
+```
+
+You could also load the backbone config and use it to create a `TimmBackbone` or pass it to the model config. Timm backbones will load pretrained weights by default. Set `use_pretrained_backbone=False` to load randomly initialized weights.
+
+```python
+from transformers import TimmBackboneConfig, TimmBackbone
+
+backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=False)
+
+# Create a backbone class
+backbone = TimmBackbone(config=backbone_config)
+
+# Create a model with a timm backbone
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+```
 
 ## Feature extractor
......
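Taken together, the documentation changes reflect the new capability: a pretrained backbone can now come straight from a Hugging Face checkpoint rather than only from timm. A minimal sketch of that HF path (checkpoint name is illustrative; loading the weights requires network access):

```python
from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation

# An HF checkpoint as backbone: use_timm_backbone=False together with
# use_pretrained_backbone=True was rejected before this commit.
config = MaskFormerConfig(
    backbone="microsoft/resnet-50",
    use_pretrained_backbone=True,
    use_timm_backbone=False,
)
model = MaskFormerForInstanceSegmentation(config)
```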
@@ -378,7 +378,14 @@ class ConditionalDetrConvEncoder(nn.Module):
             self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
         )
 
-        backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
                 if config.use_timm_backbone:
......
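The old one-liner dereferenced `config.backbone_config` whenever `use_timm_backbone` was false, which broke when an HF checkpoint name was supplied with no backbone config. The same resolution logic is repeated in the Deformable DETR, DETR, Grounding DINO and Table Transformer hunks below; as a standalone sketch of its behavior (hypothetical helper, not part of the diff):

```python
def resolve_backbone_model_type(backbone, backbone_config):
    """Mirror of the new lookup: prefer the model/checkpoint name, fall back to the config."""
    if backbone is not None:
        return backbone  # timm name or HF checkpoint, e.g. "resnet50" or "microsoft/resnet-18"
    if backbone_config is not None:
        return backbone_config.model_type  # e.g. "resnet"
    raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
```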
@@ -449,7 +449,14 @@ class DeformableDetrConvEncoder(nn.Module):
             self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
         )
 
-        backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
                 if config.use_timm_backbone:
......
@@ -129,6 +129,8 @@ class DepthAnythingConfig(PretrainedConfig):
         self.backbone_config = backbone_config
         self.backbone = backbone
         self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.backbone_kwargs = backbone_kwargs
         self.reassemble_hidden_size = reassemble_hidden_size
         self.patch_size = patch_size
         self.initializer_range = initializer_range
......
@@ -28,7 +28,7 @@ from ...file_utils import (
 from ...modeling_outputs import DepthEstimatorOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import logging
-from ..auto import AutoBackbone
+from ...utils.backbone_utils import load_backbone
 from .configuration_depth_anything import DepthAnythingConfig
@@ -365,9 +365,7 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
 
-        self.backbone = AutoBackbone.from_config(
-            config.backbone_config, attn_implementation=config._attn_implementation
-        )
+        self.backbone = load_backbone(config)
         self.neck = DepthAnythingNeck(config)
         self.head = DepthAnythingDepthEstimationHead(config)
......
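`load_backbone` reads the `backbone`, `backbone_config`, `use_timm_backbone` and `use_pretrained_backbone` fields off the model config and constructs the matching backbone class, which is what lets a plain checkpoint name replace the explicit `AutoBackbone.from_config` call. A rough usage sketch (checkpoint name and kwargs are illustrative):

```python
from transformers import DepthAnythingConfig
from transformers.utils.backbone_utils import load_backbone

# The model config now carries everything load_backbone needs.
config = DepthAnythingConfig(
    backbone="facebook/dinov2-small",
    backbone_config=None,
    use_pretrained_backbone=False,  # True would also download the backbone weights
    use_timm_backbone=False,
    backbone_kwargs={"out_indices": [-2, -1]},
)
backbone = load_backbone(config)  # returns e.g. a Dinov2Backbone instance
```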
@@ -373,7 +373,14 @@ class DetrConvEncoder(nn.Module):
             self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
         )
 
-        backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
                 if config.use_timm_backbone:
......
@@ -182,8 +182,7 @@ class DPTConfig(PretrainedConfig):
 
         use_autobackbone = False
         if self.is_hybrid:
-            if backbone_config is None and backbone is None:
-                logger.info("Initializing the config with a `BiT` backbone.")
+            if backbone_config is None:
                 backbone_config = {
                     "global_padding": "same",
                     "layer_type": "bottleneck",
@@ -191,8 +190,8 @@ class DPTConfig(PretrainedConfig):
                     "out_features": ["stage1", "stage2", "stage3"],
                     "embedding_dynamic_padding": True,
                 }
-            elif isinstance(backbone_config, dict):
+                backbone_config = BitConfig(**backbone_config)
+            if isinstance(backbone_config, dict):
                 logger.info("Initializing the config with a `BiT` backbone.")
                 backbone_config = BitConfig(**backbone_config)
             elif isinstance(backbone_config, PretrainedConfig):
@@ -208,9 +207,8 @@ class DPTConfig(PretrainedConfig):
             if readout_type != "project":
                 raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")
 
-        elif backbone_config is not None:
+        elif backbone is not None or backbone_config is not None:
             use_autobackbone = True
-
             if isinstance(backbone_config, dict):
                 backbone_model_type = backbone_config.get("model_type")
                 config_class = CONFIG_MAPPING[backbone_model_type]
@@ -219,33 +217,37 @@ class DPTConfig(PretrainedConfig):
             self.backbone_config = backbone_config
             self.backbone_featmap_shape = None
             self.neck_ignore_stages = []
+
+            # We only use load_backbone when config.is_hybrid is False
+            verify_backbone_config_arguments(
+                use_timm_backbone=use_timm_backbone,
+                use_pretrained_backbone=use_pretrained_backbone,
+                backbone=backbone,
+                backbone_config=backbone_config,
+                backbone_kwargs=backbone_kwargs,
+            )
         else:
-            self.backbone_config = backbone_config
+            self.backbone_config = None
             self.backbone_featmap_shape = None
             self.neck_ignore_stages = []
 
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
-            backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
-        )
-
         self.backbone = backbone
         self.use_pretrained_backbone = use_pretrained_backbone
         self.use_timm_backbone = use_timm_backbone
         self.backbone_kwargs = backbone_kwargs
-        self.num_hidden_layers = None if use_autobackbone else num_hidden_layers
-        self.num_attention_heads = None if use_autobackbone else num_attention_heads
-        self.intermediate_size = None if use_autobackbone else intermediate_size
-        self.hidden_dropout_prob = None if use_autobackbone else hidden_dropout_prob
-        self.attention_probs_dropout_prob = None if use_autobackbone else attention_probs_dropout_prob
-        self.layer_norm_eps = None if use_autobackbone else layer_norm_eps
-        self.image_size = None if use_autobackbone else image_size
-        self.patch_size = None if use_autobackbone else patch_size
-        self.num_channels = None if use_autobackbone else num_channels
-        self.qkv_bias = None if use_autobackbone else qkv_bias
+
+        # ViT parameters used if not using a hybrid backbone
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.use_autobackbone = use_autobackbone
         self.backbone_out_indices = None if use_autobackbone else backbone_out_indices
 
         if readout_type not in ["ignore", "add", "project"]:
......
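In short, `DPTConfig` now distinguishes three paths: the hybrid ViT+BiT layout, a `load_backbone`-driven autobackbone, and the plain ViT encoder. A hedged sketch of the first two paths (checkpoint name illustrative, defaults assumed from the current release):

```python
from transformers import DPTConfig

# Hybrid path: with no backbone_config given, a BiT backbone config is built by default.
hybrid_config = DPTConfig(is_hybrid=True)

# Autobackbone path: a `backbone` name (or `backbone_config`) routes through load_backbone.
auto_config = DPTConfig(
    is_hybrid=False,
    backbone="facebook/dinov2-small",
    use_pretrained_backbone=False,
)
```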
@@ -1071,10 +1071,10 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
         super().__init__(config)
 
         self.backbone = None
-        if config.is_hybrid or config.backbone_config is None:
-            self.dpt = DPTModel(config, add_pooling_layer=False)
-        else:
+        if config.is_hybrid is False and (config.backbone_config is not None or config.backbone is not None):
             self.backbone = load_backbone(config)
+        else:
+            self.dpt = DPTModel(config, add_pooling_layer=False)
 
         # Neck
         self.neck = DPTNeck(config)
......
@@ -463,7 +463,14 @@ class GroundingDinoConvEncoder(nn.Module):
             self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
         )
 
-        backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
                 if config.use_timm_backbone:
......
@@ -305,7 +305,14 @@ class TableTransformerConvEncoder(nn.Module):
             self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
         )
 
-        backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
                 if config.use_timm_backbone:
......
@@ -50,8 +50,10 @@ class TimmBackbone(PreTrainedModel, BackboneMixin):
         if config.backbone is None:
             raise ValueError("backbone is not set in the config. Please set it to a timm model name.")
 
-        if config.backbone not in timm.list_models():
-            raise ValueError(f"backbone {config.backbone} is not supported by timm.")
+        # Certain timm models have the structure `model_name.version` e.g. vit_large_patch14_dinov2.lvd142m
+        base_backbone_model = config.backbone.split(".")[0]
+        if base_backbone_model not in timm.list_models():
+            raise ValueError(f"backbone {base_backbone_model} is not supported by timm.")
 
         if hasattr(config, "out_features") and config.out_features is not None:
             raise ValueError("out_features is not supported by TimmBackbone. Please use out_indices instead.")
......
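The relaxed check matters for timm names that carry a pretrained-tag suffix after a dot. A quick illustration (requires timm installed; model name taken from the comment above):

```python
import timm

backbone = "vit_large_patch14_dinov2.lvd142m"    # "<architecture>.<pretrained tag>"
base_backbone_model = backbone.split(".")[0]     # "vit_large_patch14_dinov2"

print(backbone in timm.list_models())             # False - the tagged name is not listed
print(base_backbone_model in timm.list_models())  # True on reasonably recent timm releases
```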
@@ -143,8 +143,18 @@ class TvpVisionModel(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.backbone = load_backbone(config)
+
+        if config.backbone_config is not None:
+            in_channels = config.backbone_config.hidden_sizes[-1]
+        elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_sizes"):
+            in_channels = self.backbone.config.hidden_sizes[-1]
+        elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_size"):
+            in_channels = self.backbone.config.hidden_size
+        else:
+            raise ValueError("Backbone config not found")
+
         self.grid_encoder_conv = nn.Conv2d(
-            config.backbone_config.hidden_sizes[-1],
+            in_channels,
             config.hidden_size,
             kernel_size=3,
             stride=1,
......
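The fallback chain exists because CNN-style backbone configs expose per-stage `hidden_sizes` while ViT-style configs expose a single `hidden_size`. A small check with default configs (values assumed from the current release):

```python
from transformers import ResNetConfig, ViTConfig

print(ResNetConfig().hidden_sizes[-1])  # 2048 -> used as in_channels for CNN backbones
print(ViTConfig().hidden_size)          # 768  -> fallback for transformer backbones
```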
@@ -115,7 +115,12 @@ class VitMatteConvStream(nn.Module):
     def __init__(self, config):
         super().__init__()
 
-        in_channels = config.backbone_config.num_channels
+        # We use a default in case there isn't a backbone config set. This is for backwards compatibility and
+        # to enable loading HF backbone models.
+        in_channels = 4
+        if config.backbone_config is not None:
+            in_channels = config.backbone_config.num_channels
+
         out_channels = config.convstream_hidden_sizes
 
         self.convs = nn.ModuleList()
......
@@ -368,11 +368,6 @@ def verify_backbone_config_arguments(
     """
     Verify that the config arguments to be passed to load_backbone are valid
     """
-    if not use_timm_backbone and use_pretrained_backbone:
-        raise ValueError(
-            "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`"
-        )
-
     if backbone_config is not None and backbone is not None:
         raise ValueError("You can't specify both `backbone` and `backbone_config`.")
......
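Removing this guard is the core enabling change: `use_pretrained_backbone=True` no longer forces `use_timm_backbone=True`. A minimal sketch of a config that used to be rejected and should now validate (model name illustrative; pretrained weights are fetched from the Hub):

```python
from transformers import DetrConfig, DetrForObjectDetection

config = DetrConfig(
    backbone="microsoft/resnet-18",  # HF checkpoint, not a timm name
    backbone_config=None,
    use_timm_backbone=False,
    use_pretrained_backbone=True,    # previously raised a ValueError here
)
model = DetrForObjectDetection(config)
```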
@@ -476,6 +476,42 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
         self.assertTrue(outputs)
 
+    @require_timm
+    def test_hf_backbone(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Load a pretrained HF checkpoint as backbone
+        config.backbone = "microsoft/resnet-18"
+        config.backbone_config = None
+        config.use_timm_backbone = False
+        config.use_pretrained_backbone = True
+        config.backbone_kwargs = {"out_indices": [2, 3, 4]}
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if model_class.__name__ == "ConditionalDetrForObjectDetection":
+                expected_shape = (
+                    self.model_tester.batch_size,
+                    self.model_tester.num_queries,
+                    self.model_tester.num_labels,
+                )
+                self.assertEqual(outputs.logits.shape, expected_shape)
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3)
+            elif model_class.__name__ == "ConditionalDetrForSegmentation":
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3)
+            else:
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3)
+
+            self.assertTrue(outputs)
+
     def test_initialization(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......
@@ -544,9 +544,38 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
                 self.assertEqual(outputs.logits.shape, expected_shape)
                 # Confirm out_indices was propagated to backbone
                 self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4)
-            elif model_class.__name__ == "ConditionalDetrForSegmentation":
-                # Confirm out_indices was propagated to backbone
-                self.assertEqual(len(model.deformable_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 4)
             else:
                 # Confirm out_indices was propagated to backbone
                 self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4)
 
+            self.assertTrue(outputs)
+
+    def test_hf_backbone(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Load a pretrained HF checkpoint as backbone
+        config.backbone = "microsoft/resnet-18"
+        config.backbone_config = None
+        config.use_timm_backbone = False
+        config.use_pretrained_backbone = True
+        config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]}
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if model_class.__name__ == "DeformableDetrForObjectDetection":
+                expected_shape = (
+                    self.model_tester.batch_size,
+                    self.model_tester.num_queries,
+                    self.model_tester.num_labels,
+                )
+                self.assertEqual(outputs.logits.shape, expected_shape)
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4)
+            else:
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4)
......
@@ -207,6 +207,35 @@ class DepthAnythingModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
         model = DepthAnythingForDepthEstimation.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
+    def test_backbone_selection(self):
+        def _validate_backbone_init():
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.to(torch_device)
+                model.eval()
+
+                # Confirm out_indices propagated to backbone
+                self.assertEqual(len(model.backbone.out_indices), 2)
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Load a timm backbone
+        config.backbone = "resnet18"
+        config.use_pretrained_backbone = True
+        config.use_timm_backbone = True
+        config.backbone_config = None
+        # For transformer backbones we can't set the out_indices or just return the features
+        config.backbone_kwargs = {"out_indices": (-2, -1)}
+        _validate_backbone_init()
+
+        # Load a HF backbone
+        config.backbone = "facebook/dinov2-small"
+        config.use_pretrained_backbone = True
+        config.use_timm_backbone = False
+        config.backbone_config = None
+        config.backbone_kwargs = {"out_indices": [-2, -1]}
+        _validate_backbone_init()
+
     # We will verify our results on an image of cute cats
     def prepare_img():
......
@@ -476,6 +476,41 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         self.assertTrue(outputs)
 
+    def test_hf_backbone(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Load a pretrained HF checkpoint as backbone
+        config.backbone = "microsoft/resnet-18"
+        config.backbone_config = None
+        config.use_timm_backbone = False
+        config.use_pretrained_backbone = True
+        config.backbone_kwargs = {"out_indices": [2, 3, 4]}
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if model_class.__name__ == "DetrForObjectDetection":
+                expected_shape = (
+                    self.model_tester.batch_size,
+                    self.model_tester.num_queries,
+                    self.model_tester.num_labels + 1,
+                )
+                self.assertEqual(outputs.logits.shape, expected_shape)
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3)
+            elif model_class.__name__ == "DetrForSegmentation":
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3)
+            else:
+                # Confirm out_indices was propagated to backbone
+                self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3)
+
+            self.assertTrue(outputs)
+
     def test_greyscale_images(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......
@@ -276,6 +276,34 @@ class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
                     msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                 )
 
+    def test_backbone_selection(self):
+        def _validate_backbone_init():
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.to(torch_device)
+                model.eval()
+
+                if model.__class__.__name__ == "DPTForDepthEstimation":
+                    # Confirm out_indices propagated to backbone
+                    self.assertEqual(len(model.backbone.out_indices), 2)
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.use_pretrained_backbone = True
+        config.backbone_config = None
+        config.backbone_kwargs = {"out_indices": [-2, -1]}
+        # Force load_backbone path
+        config.is_hybrid = False
+
+        # Load a timm backbone
+        config.backbone = "resnet18"
+        config.use_timm_backbone = True
+        _validate_backbone_init()
+
+        # Load a HF backbone
+        config.backbone = "facebook/dinov2-small"
+        config.use_timm_backbone = False
+        _validate_backbone_init()
+
     @slow
     def test_model_from_pretrained(self):
         model_name = "Intel/dpt-large"
......
@@ -501,6 +501,34 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
         self.assertTrue(outputs)
 
+    @require_timm
+    def test_hf_backbone(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Load a pretrained HF checkpoint as backbone
+        config.backbone = "microsoft/resnet-18"
+        config.backbone_config = None
+        config.use_timm_backbone = False
+        config.use_pretrained_backbone = True
+        config.backbone_kwargs = {"out_indices": [2, 3, 4]}
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if model_class.__name__ == "GroundingDinoForObjectDetection":
+                expected_shape = (
+                    self.model_tester.batch_size,
+                    self.model_tester.num_queries,
+                    config.max_text_len,
+                )
+                self.assertEqual(outputs.logits.shape, expected_shape)
+
+            self.assertTrue(outputs)
+
     def test_initialization(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......