Unverified Commit 9858ecd7 authored by Younes Belkada, committed by GitHub

[`ViTHybrid`] Fix `accelerate` slow tests (#20679)

* fix failing `accelerate` tests

* make fixup

* smaller values

* even lower
parent 69038ce0
src/transformers/models/vit_hybrid/configuration_vit_hybrid.py

@@ -71,6 +71,8 @@ class ViTHybridConfig(PretrainedConfig):
             Whether to add a bias to the queries, keys and values.
         backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*, defaults to `None`):
             The configuration of the backbone in a dictionary or the config object of the backbone.
+        backbone_featmap_shape (`List[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
+            Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
 
     Example:
@@ -103,6 +105,7 @@ class ViTHybridConfig(PretrainedConfig):
         image_size=224,
         patch_size=1,
         num_channels=3,
+        backbone_featmap_shape=[1, 1024, 24, 24],
         qkv_bias=True,
         **kwargs
     ):
@@ -128,6 +131,7 @@ class ViTHybridConfig(PretrainedConfig):
             backbone_config_class = BitConfig
             backbone_config = backbone_config_class(**backbone_config)
 
+        self.backbone_featmap_shape = backbone_featmap_shape
         self.backbone_config = backbone_config
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
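The new field is stored on the config verbatim, so it can be read back or overridden at construction time. A minimal usage sketch, assuming a transformers install that already contains this change (the `[1, 2048, 12, 12]` override is purely hypothetical):

    from transformers import ViTHybridConfig

    # Default value, as documented above.
    config = ViTHybridConfig()
    print(config.backbone_featmap_shape)  # [1, 1024, 24, 24]

    # The shape can be overridden to match a different backbone output.
    custom = ViTHybridConfig(backbone_featmap_shape=[1, 2048, 12, 12])
    print(custom.backbone_featmap_shape)  # [1, 2048, 12, 12]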
src/transformers/models/vit_hybrid/modeling_vit_hybrid.py

@@ -166,11 +166,10 @@ class ViTHybridPatchEmbeddings(nn.Module):
         feature_dim = self.backbone.channels[-1]
         if feature_size is None:
-            dummy_image = torch.zeros(1, num_channels, image_size[0], image_size[1])
-            with torch.no_grad():
-                feature_map = self.backbone(dummy_image).feature_maps[-1]
-            feature_size = feature_map.shape[-2:]
-            feature_dim = feature_map.shape[1]
+            feature_map = config.backbone_featmap_shape
+
+            feature_size = feature_map[-2:]
+            feature_dim = feature_map[1]
         else:
             feature_size = (
                 feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
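Instead of running a dummy forward pass through the backbone at init time, the patch embedding now derives its geometry from the configured shape. A rough numeric sketch of what the default `[1, 1024, 24, 24]` implies, assuming the usual patch count of feature-map area divided by patch area (the test comments further down describe the resulting sequence length):

    # Illustration only, using the defaults introduced in this commit.
    backbone_featmap_shape = [1, 1024, 24, 24]  # (batch, channels, height, width)
    patch_size = 1                              # ViTHybridConfig default

    feature_size = backbone_featmap_shape[-2:]  # [24, 24], spatial size of the backbone feature map
    feature_dim = backbone_featmap_shape[1]     # 1024 channels fed into the patch projection
    num_patches = (feature_size[0] // patch_size) * (feature_size[1] // patch_size)
    print(num_patches, num_patches + 1)         # 576 patches, 577 tokens once the [CLS] token is added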
tests/models/vit_hybrid/test_modeling_vit_hybrid.py

@@ -19,7 +19,7 @@ import inspect
 import unittest
 
 from transformers import ViTHybridConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ...test_configuration_common import ConfigTester
@@ -57,6 +57,7 @@ class ViTHybridModelTester:
         attention_probs_dropout_prob=0.1,
         type_sequence_label_size=10,
         initializer_range=0.02,
+        backbone_featmap_shape=[1, 16, 4, 4],
         scope=None,
     ):
         self.parent = parent
@@ -76,6 +77,7 @@ class ViTHybridModelTester:
         self.type_sequence_label_size = type_sequence_label_size
         self.initializer_range = initializer_range
         self.scope = scope
+        self.backbone_featmap_shape = backbone_featmap_shape
 
         # in ViT hybrid, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
         # the number of patches is based on the feature map of the backbone, which by default uses an output stride
@@ -95,6 +97,16 @@ class ViTHybridModelTester:
         return config, pixel_values, labels
 
     def get_config(self):
+        backbone_config = {
+            "global_padding": "same",
+            "layer_type": "bottleneck",
+            "depths": [3, 4, 9],
+            "out_features": ["stage1", "stage2", "stage3"],
+            "embedding_dynamic_padding": True,
+            "hidden_sizes": [4, 8, 16, 32],
+            "num_groups": 2,
+        }
+
         return ViTHybridConfig(
             image_size=self.image_size,
             patch_size=self.patch_size,
@@ -108,6 +120,8 @@ class ViTHybridModelTester:
             attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             is_decoder=False,
             initializer_range=self.initializer_range,
+            backbone_featmap_shape=self.backbone_featmap_shape,
+            backbone_config=backbone_config,
         )
 
     def create_and_check_model(self, config, pixel_values, labels):
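The tester passes `backbone_config` as a plain dict; per the configuration hunk above, a dict without a `model_type` key is converted into a `BitConfig`. A minimal sketch of the resulting tiny configuration (the commit message's "smaller values" / "even lower" presumably refer to these reduced test-model sizes), assuming a transformers version that includes this commit:

    from transformers import ViTHybridConfig

    config = ViTHybridConfig(
        backbone_config={
            "global_padding": "same",
            "layer_type": "bottleneck",
            "depths": [3, 4, 9],
            "out_features": ["stage1", "stage2", "stage3"],
            "embedding_dynamic_padding": True,
            "hidden_sizes": [4, 8, 16, 32],
            "num_groups": 2,
        },
        backbone_featmap_shape=[1, 16, 4, 4],
    )
    print(type(config.backbone_config).__name__)  # expected: BitConfig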
@@ -229,3 +243,19 @@ class ViTModelIntegrationTest(unittest.TestCase):
         expected_slice = torch.tensor([-1.9090, -0.4993, -0.2389]).to(torch_device)
 
         self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
+
+    @slow
+    @require_accelerate
+    def test_accelerate_inference(self):
+        feature_extractor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
+        model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", device_map="auto")
+
+        image = prepare_img()
+        inputs = feature_extractor(images=image, return_tensors="pt")
+        outputs = model(**inputs)
+        logits = outputs.logits
+
+        # model predicts one of the 1000 ImageNet classes
+        predicted_class_idx = logits.argmax(-1).item()
+
+        self.assertTrue(model.config.id2label[predicted_class_idx], "tabby, tabby cat")
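The new integration test is gated behind `@slow` and `@require_accelerate`, so it only runs when slow tests are enabled and `accelerate` is installed (loading with `device_map="auto"` requires accelerate). Locally, something along these lines should exercise it, with the test file path assumed from the class names above: `RUN_SLOW=1 python -m pytest tests/models/vit_hybrid/test_modeling_vit_hybrid.py -k test_accelerate_inference`.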