Unverified Commit c0452490 authored by Shehan Munasinghe's avatar Shehan Munasinghe Committed by GitHub
Browse files

Add swiftformer (#22686)



* Commit the automatically generated code

using add-new-model-like

* Update description at swiftformer.mdx file

* remove autogenerated code for MaskedImageModeling

* update weight conversion scripts

* Update modeling_swiftformer.py

* update configuration_swiftformer.py

* Update test_modeling_swiftformer.py

* update modeling code - remove einops dependency

* Update _toctree.yml

* update modeling code - remove copied from comments

* update docs

* Revert "update docs"

This reverts commit c2e05e2998fe2cd6eaee8b8cc31aca5222bac9fb.

* update docs

* remove unused reference SwiftFormerImageProcessor

* update dependency_versions_table.py

* update swiftformer.mdx

* update swiftformer.mdx

* change model output type - no attentions

* update model org name

* Fix typo

* fix copies

* Update tests/models/swiftformer/test_modeling_swiftformer.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/auto/feature_extraction_auto.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/swiftformer.mdx
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/swiftformer/configuration_swiftformer.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update modeling_swiftformer.py

fix-copies

* make style, make quality, fix-copies

* Apply suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make style
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fix-copies

* Update modeling_swiftformer.py

* Update modeling_swiftformer.py

* Add suggestions from code review
Co-Authored-By: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 364ced68
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert SwiftFormer checkpoints from the original implementation."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
SwiftFormerConfig,
SwiftFormerForImageClassification,
ViTImageProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
device = torch.device("cpu")
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
def get_expected_output(swiftformer_name):
if swiftformer_name == "swiftformer_xs":
return torch.tensor([-2.1703e00, 2.1107e00, -2.0811e00, 8.8685e-01, 2.4360e-01])
elif swiftformer_name == "swiftformer_s":
return torch.tensor([3.9636e-01, 2.3478e-01, -1.6963e00, -1.7381e00, -8.6337e-01])
elif swiftformer_name == "swiftformer_l1":
return torch.tensor([-4.2768e-01, -4.7429e-01, -1.0897e00, -1.0248e00, 3.5523e-02])
elif swiftformer_name == "swiftformer_l3":
return torch.tensor([-2.5330e-01, 2.4211e-01, -6.0185e-01, -8.2789e-01, -6.0446e-02])
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def create_rename_keys(state_dict):
rename_keys = []
for k in state_dict.keys():
k_new = k
if ".pwconv" in k:
k_new = k_new.replace(".pwconv", ".point_wise_conv")
if ".dwconv" in k:
k_new = k_new.replace(".dwconv", ".depth_wise_conv")
if ".Proj." in k:
k_new = k_new.replace(".Proj.", ".proj.")
if "patch_embed" in k_new:
k_new = k_new.replace("patch_embed", "swiftformer.patch_embed.patch_embedding")
if "network" in k_new:
ls = k_new.split(".")
if ls[2].isdigit():
k_new = "swiftformer.encoder.network." + ls[1] + ".blocks." + ls[2] + "." + ".".join(ls[3:])
else:
k_new = k_new.replace("network", "swiftformer.encoder.network")
rename_keys.append((k, k_new))
return rename_keys
@torch.no_grad()
def convert_swiftformer_checkpoint(swiftformer_name, pytorch_dump_folder_path, original_ckpt):
"""
Copy/paste/tweak model's weights to our SwiftFormer structure.
"""
# define default SwiftFormer configuration
config = SwiftFormerConfig()
# dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size
config.num_labels = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
# size of the architecture
if swiftformer_name == "swiftformer_xs":
config.depths = [3, 3, 6, 4]
config.embed_dims = [48, 56, 112, 220]
elif swiftformer_name == "swiftformer_s":
config.depths = [3, 3, 9, 6]
config.embed_dims = [48, 64, 168, 224]
elif swiftformer_name == "swiftformer_l1":
config.depths = [4, 3, 10, 5]
config.embed_dims = [48, 96, 192, 384]
elif swiftformer_name == "swiftformer_l3":
config.depths = [4, 4, 12, 6]
config.embed_dims = [64, 128, 320, 512]
# load state_dict of original model, remove and rename some keys
if original_ckpt:
if original_ckpt.startswith("https"):
checkpoint = torch.hub.load_state_dict_from_url(original_ckpt, map_location="cpu", check_hash=True)
else:
checkpoint = torch.load(original_ckpt, map_location="cpu")
state_dict = checkpoint
rename_keys = create_rename_keys(state_dict)
for rename_key_src, rename_key_dest in rename_keys:
rename_key(state_dict, rename_key_src, rename_key_dest)
# load HuggingFace model
hf_model = SwiftFormerForImageClassification(config).eval()
hf_model.load_state_dict(state_dict)
# prepare test inputs
image = prepare_img()
processor = ViTImageProcessor.from_pretrained("preprocessor_config")
inputs = processor(images=image, return_tensors="pt")
# compare outputs from both models
timm_logits = get_expected_output(swiftformer_name)
hf_logits = hf_model(inputs["pixel_values"]).logits
assert hf_logits.shape == torch.Size([1, 1000])
assert torch.allclose(hf_logits[0, 0:5], timm_logits, atol=1e-3)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {swiftformer_name} to {pytorch_dump_folder_path}")
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--swiftformer_name",
default="swiftformer_xs",
choices=["swiftformer_xs", "swiftformer_s", "swiftformer_l1", "swiftformer_l3"],
type=str,
help="Name of the SwiftFormer model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="./converted_outputs/",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--original_ckpt", default=None, type=str, help="Path to the original model checkpoint.")
args = parser.parse_args()
convert_swiftformer_checkpoint(args.swiftformer_name, args.pytorch_dump_folder_path, args.original_ckpt)
This diff is collapsed.
...@@ -6440,6 +6440,30 @@ class SqueezeBertPreTrainedModel(metaclass=DummyObject): ...@@ -6440,6 +6440,30 @@ class SqueezeBertPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
class SwiftFormerForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SwiftFormerModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SwiftFormerPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch SwiftFormer model. """
import copy
import inspect
import unittest
from transformers import PretrainedConfig, SwiftFormerConfig
from transformers.testing_utils import (
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import SwiftFormerForImageClassification, SwiftFormerModel
from transformers.models.swiftformer.modeling_swiftformer import SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import ViTImageProcessor
class SwiftFormerModelTester:
def __init__(
self,
parent,
batch_size=13,
num_channels=3,
is_training=True,
use_labels=True,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
image_size=224,
num_labels=1000,
layer_depths=[3, 3, 6, 4],
embed_dims=[48, 56, 112, 220],
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.is_training = is_training
self.use_labels = use_labels
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.num_labels = num_labels
self.image_size = image_size
self.layer_depths = layer_depths
self.embed_dims = embed_dims
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return SwiftFormerConfig(
depths=self.layer_depths,
embed_dims=self.embed_dims,
mlp_ratio=4,
downsamples=[True, True, True, True],
hidden_act="gelu",
num_labels=self.num_labels,
down_patch_size=3,
down_stride=2,
down_pad=1,
drop_rate=0.0,
drop_path_rate=0.0,
use_layer_scale=True,
layer_scale_init_value=1e-5,
)
def create_and_check_model(self, config, pixel_values, labels):
model = SwiftFormerModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dims[-1], 7, 7))
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.num_labels
model = SwiftFormerForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
model = SwiftFormerForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def prepare_config_and_inputs_for_common(self):
(config, pixel_values, labels) = self.prepare_config_and_inputs()
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as SwiftFormer does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (SwiftFormerModel, SwiftFormerForImageClassification) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": SwiftFormerModel, "image-classification": SwiftFormerForImageClassification}
if is_torch_available()
else {}
)
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
has_attentions = False
def setUp(self):
self.model_tester = SwiftFormerModelTester(self)
self.config_tester = ConfigTester(
self,
config_class=SwiftFormerConfig,
has_text_modality=False,
hidden_size=37,
num_attention_heads=12,
num_hidden_layers=12,
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="SwiftFormer does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = SwiftFormerModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="SwiftFormer does not output attentions")
def test_attention_outputs(self):
pass
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_stages = 8
self.assertEqual(len(hidden_states), expected_num_stages) # TODO
# SwiftFormer's feature maps are of shape (batch_size, embed_dims, height, width)
# with the width and height being successively divided by 2, after every 2 blocks
for i in range(len(hidden_states)):
self.assertEqual(
hidden_states[i].shape,
torch.Size(
[
self.model_tester.batch_size,
self.model_tester.embed_dims[i // 2],
(self.model_tester.image_size // 4) // 2 ** (i // 2),
(self.model_tester.image_size // 4) // 2 ** (i // 2),
]
),
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
def test_initialization(self):
def _config_zero_init(config):
configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys():
if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
setattr(configs_no_init, key, 1e-10)
if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
no_init_subconfig = _config_zero_init(getattr(configs_no_init, key))
setattr(configs_no_init, key, no_init_subconfig)
return configs_no_init
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9) / 1e9).round().item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
class SwiftFormerModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs").to(torch_device)
feature_extractor = self.default_feature_extractor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([[-2.1703e00, 2.1107e00, -2.0811e00]])
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment