Unverified Commit c0452490 authored by Shehan Munasinghe, committed by GitHub

Add swiftformer (#22686)



* Commit the automatically generated code

using add-new-model-like

* Update description at swiftformer.mdx file

* remove autogenerated code for MaskedImageModeling

* update weight conversion scripts

* Update modeling_swiftformer.py

* update configuration_swiftformer.py

* Update test_modeling_swiftformer.py

* update modeling code - remove einops dependency

* Update _toctree.yml

* update modeling code - remove copied from comments

* update docs

* Revert "update docs"

This reverts commit c2e05e2998fe2cd6eaee8b8cc31aca5222bac9fb.

* update docs

* remove unused reference SwiftFormerImageProcessor

* update dependency_versions_table.py

* update swiftformer.mdx

* update swiftformer.mdx

* change model output type - no attentions

* update model org name

* Fix typo

* fix copies

* Update tests/models/swiftformer/test_modeling_swiftformer.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/auto/feature_extraction_auto.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/swiftformer.mdx
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/swiftformer/configuration_swiftformer.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update modeling_swiftformer.py

fix-copies

* make style, make quality, fix-copies

* Apply suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make style
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fix-copies

* Update modeling_swiftformer.py

* Update modeling_swiftformer.py

* Add suggestions from code review
Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 364ced68
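This PR adds SwiftFormer image classification models to Transformers. A minimal usage sketch, assuming the converted `MBZUAI/swiftformer-xs` checkpoint referenced in the code below is available on the Hub:

import requests
import torch
from PIL import Image
from transformers import SwiftFormerForImageClassification, ViTImageProcessor

# load the test image used throughout this PR's conversion script and tests
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs")
model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs").eval()

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])  # e.g. "tabby, tabby cat"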
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert SwiftFormer checkpoints from the original implementation."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
SwiftFormerConfig,
SwiftFormerForImageClassification,
ViTImageProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
device = torch.device("cpu")
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
def get_expected_output(swiftformer_name):
if swiftformer_name == "swiftformer_xs":
return torch.tensor([-2.1703e00, 2.1107e00, -2.0811e00, 8.8685e-01, 2.4360e-01])
elif swiftformer_name == "swiftformer_s":
return torch.tensor([3.9636e-01, 2.3478e-01, -1.6963e00, -1.7381e00, -8.6337e-01])
elif swiftformer_name == "swiftformer_l1":
return torch.tensor([-4.2768e-01, -4.7429e-01, -1.0897e00, -1.0248e00, 3.5523e-02])
elif swiftformer_name == "swiftformer_l3":
return torch.tensor([-2.5330e-01, 2.4211e-01, -6.0185e-01, -8.2789e-01, -6.0446e-02])
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def create_rename_keys(state_dict):
rename_keys = []
for k in state_dict.keys():
k_new = k
if ".pwconv" in k:
k_new = k_new.replace(".pwconv", ".point_wise_conv")
if ".dwconv" in k:
k_new = k_new.replace(".dwconv", ".depth_wise_conv")
if ".Proj." in k:
k_new = k_new.replace(".Proj.", ".proj.")
if "patch_embed" in k_new:
k_new = k_new.replace("patch_embed", "swiftformer.patch_embed.patch_embedding")
if "network" in k_new:
ls = k_new.split(".")
if ls[2].isdigit():
k_new = "swiftformer.encoder.network." + ls[1] + ".blocks." + ls[2] + "." + ".".join(ls[3:])
else:
k_new = k_new.replace("network", "swiftformer.encoder.network")
rename_keys.append((k, k_new))
return rename_keys
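# For illustration only (the exact key names depend on the original checkpoint): a hypothetical
# original key "network.0.0.dwconv.weight" would be renamed by the rules above to
# "swiftformer.encoder.network.0.blocks.0.depth_wise_conv.weight".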
@torch.no_grad()
def convert_swiftformer_checkpoint(swiftformer_name, pytorch_dump_folder_path, original_ckpt):
"""
Copy/paste/tweak model's weights to our SwiftFormer structure.
"""
# define default SwiftFormer configuration
config = SwiftFormerConfig()
# set up the ImageNet-1k label mapping (all SwiftFormer checkpoints are 1000-class classifiers)
config.num_labels = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
# size of the architecture
if swiftformer_name == "swiftformer_xs":
config.depths = [3, 3, 6, 4]
config.embed_dims = [48, 56, 112, 220]
elif swiftformer_name == "swiftformer_s":
config.depths = [3, 3, 9, 6]
config.embed_dims = [48, 64, 168, 224]
elif swiftformer_name == "swiftformer_l1":
config.depths = [4, 3, 10, 5]
config.embed_dims = [48, 96, 192, 384]
elif swiftformer_name == "swiftformer_l3":
config.depths = [4, 4, 12, 6]
config.embed_dims = [64, 128, 320, 512]
# load state_dict of original model, remove and rename some keys
if original_ckpt:
if original_ckpt.startswith("https"):
checkpoint = torch.hub.load_state_dict_from_url(original_ckpt, map_location="cpu", check_hash=True)
else:
checkpoint = torch.load(original_ckpt, map_location="cpu")
state_dict = checkpoint
rename_keys = create_rename_keys(state_dict)
for rename_key_src, rename_key_dest in rename_keys:
rename_key(state_dict, rename_key_src, rename_key_dest)
# load HuggingFace model
hf_model = SwiftFormerForImageClassification(config).eval()
hf_model.load_state_dict(state_dict)
# prepare test inputs
image = prepare_img()
processor = ViTImageProcessor.from_pretrained("preprocessor_config")
inputs = processor(images=image, return_tensors="pt")
# compare outputs from both models
timm_logits = get_expected_output(swiftformer_name)
hf_logits = hf_model(inputs["pixel_values"]).logits
assert hf_logits.shape == torch.Size([1, 1000])
assert torch.allclose(hf_logits[0, 0:5], timm_logits, atol=1e-3)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {swiftformer_name} to {pytorch_dump_folder_path}")
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--swiftformer_name",
default="swiftformer_xs",
choices=["swiftformer_xs", "swiftformer_s", "swiftformer_l1", "swiftformer_l3"],
type=str,
help="Name of the SwiftFormer model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="./converted_outputs/",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--original_ckpt", default=None, type=str, help="Path to the original model checkpoint.")
args = parser.parse_args()
convert_swiftformer_checkpoint(args.swiftformer_name, args.pytorch_dump_folder_path, args.original_ckpt)
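For reference, a minimal invocation sketch of the conversion function above. The checkpoint path is a placeholder for locally downloaded original SwiftFormer-XS weights, and the script as written also expects a local `preprocessor_config` directory containing a ViT image processor configuration:

# hypothetical local path; expected logits are only defined for the four supported model names
convert_swiftformer_checkpoint(
    swiftformer_name="swiftformer_xs",
    pytorch_dump_folder_path="./converted_outputs/",
    original_ckpt="./SwiftFormer_XS.pth",
)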
# coding=utf-8
# Copyright 2023 MBZUAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch SwiftFormer model."""
import collections.abc
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2CLS
from ...modeling_outputs import (
BaseModelOutputWithNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_swiftformer import SwiftFormerConfig
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "SwiftFormerConfig"
# Base docstring
_CHECKPOINT_FOR_DOC = "MBZUAI/swiftformer-xs"
_EXPECTED_OUTPUT_SHAPE = [1, 220, 7, 7]
# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "MBZUAI/swiftformer-xs"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"MBZUAI/swiftformer-xs",
# See all SwiftFormer models at https://huggingface.co/models?filter=swiftformer
]
class SwiftFormerPatchEmbedding(nn.Module):
"""
Patch Embedding Layer constructed of two 2D convolutional layers.
Input: tensor of shape `[batch_size, in_channels, height, width]`
Output: tensor of shape `[batch_size, out_channels, height/4, width/4]`
"""
def __init__(self, config: SwiftFormerConfig):
super().__init__()
in_chs = config.num_channels
out_chs = config.embed_dims[0]
self.patch_embedding = nn.Sequential(
nn.Conv2d(in_chs, out_chs // 2, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(out_chs // 2, eps=config.batch_norm_eps),
nn.ReLU(),
nn.Conv2d(out_chs // 2, out_chs, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(out_chs, eps=config.batch_norm_eps),
nn.ReLU(),
)
def forward(self, x):
return self.patch_embedding(x)
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input, drop_prob: float = 0.0, training: bool = False):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->SwiftFormer
class SwiftFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class SwiftFormerEmbeddings(nn.Module):
"""
Embeddings layer consisting of a single 2D convolutional and batch normalization layer.
Input: tensor of shape `[batch_size, channels, height, width]`
Output: tensor of shape `[batch_size, channels, height/stride, width/stride]`
"""
def __init__(self, config: SwiftFormerConfig, index: int):
super().__init__()
patch_size = config.down_patch_size
stride = config.down_stride
padding = config.down_pad
embed_dims = config.embed_dims
in_chans = embed_dims[index]
embed_dim = embed_dims[index + 1]
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride)
padding = padding if isinstance(padding, collections.abc.Iterable) else (padding, padding)
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps)
def forward(self, x):
x = self.proj(x)
x = self.norm(x)
return x
class SwiftFormerConvEncoder(nn.Module):
"""
`SwiftFormerConvEncoder` with 3*3 and 1*1 convolutions.
Input: tensor of shape `[batch_size, channels, height, width]`
Output: tensor of shape `[batch_size, channels, height, width]`
"""
def __init__(self, config: SwiftFormerConfig, dim: int):
super().__init__()
hidden_dim = int(config.mlp_ratio * dim)
self.depth_wise_conv = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
self.norm = nn.BatchNorm2d(dim, eps=config.batch_norm_eps)
self.point_wise_conv1 = nn.Conv2d(dim, hidden_dim, kernel_size=1)
self.act = nn.GELU()
self.point_wise_conv2 = nn.Conv2d(hidden_dim, dim, kernel_size=1)
self.drop_path = nn.Identity()
self.layer_scale = nn.Parameter(torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True)
def forward(self, x):
input = x
x = self.depth_wise_conv(x)
x = self.norm(x)
x = self.point_wise_conv1(x)
x = self.act(x)
x = self.point_wise_conv2(x)
x = input + self.drop_path(self.layer_scale * x)
return x
class SwiftFormerMlp(nn.Module):
"""
MLP layer with 1*1 convolutions.
Input: tensor of shape `[batch_size, channels, height, width]`
Output: tensor of shape `[batch_size, channels, height, width]`
"""
def __init__(self, config: SwiftFormerConfig, in_features: int):
super().__init__()
hidden_features = int(in_features * config.mlp_ratio)
self.norm1 = nn.BatchNorm2d(in_features, eps=config.batch_norm_eps)
self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
act_layer = ACT2CLS[config.hidden_act]
self.act = act_layer()
self.fc2 = nn.Conv2d(hidden_features, in_features, 1)
self.drop = nn.Dropout(p=0.0)
def forward(self, x):
x = self.norm1(x)
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class SwiftFormerEfficientAdditiveAttention(nn.Module):
"""
Efficient Additive Attention module for SwiftFormer.
    Input: tensor of shape `[batch_size, sequence_length, channels]` (the spatially flattened feature map)
    Output: tensor of shape `[batch_size, sequence_length, channels]`
"""
def __init__(self, config: SwiftFormerConfig, dim: int = 512):
super().__init__()
self.to_query = nn.Linear(dim, dim)
self.to_key = nn.Linear(dim, dim)
self.w_g = nn.Parameter(torch.randn(dim, 1))
self.scale_factor = dim**-0.5
self.proj = nn.Linear(dim, dim)
self.final = nn.Linear(dim, dim)
    def forward(self, x):
        # x is the spatially flattened feature map of shape (batch_size, sequence_length, channels)
        query = self.to_query(x)
        key = self.to_key(x)
        # normalize queries and keys along the channel dimension
        query = torch.nn.functional.normalize(query, dim=-1)
        key = torch.nn.functional.normalize(key, dim=-1)
        # score each query token against the learned vector w_g and scale
        query_weight = query @ self.w_g
        scaled_query_weight = query_weight * self.scale_factor
        scaled_query_weight = scaled_query_weight.softmax(dim=-1)
        # pool the weighted queries into one global query and broadcast it over all key positions
        global_queries = torch.sum(scaled_query_weight * query, dim=1)
        global_queries = global_queries.unsqueeze(1).repeat(1, key.shape[1], 1)
        # element-wise interaction between the global query and the keys, with a residual query term
        out = self.proj(global_queries * key) + query
        out = self.final(out)
        return out
class SwiftFormerLocalRepresentation(nn.Module):
"""
Local Representation module for SwiftFormer that is implemented by 3*3 depth-wise and point-wise convolutions.
Input: tensor of shape `[batch_size, channels, height, width]`
Output: tensor of shape `[batch_size, channels, height, width]`
"""
def __init__(self, config: SwiftFormerConfig, dim: int):
super().__init__()
self.depth_wise_conv = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
self.norm = nn.BatchNorm2d(dim, eps=config.batch_norm_eps)
self.point_wise_conv1 = nn.Conv2d(dim, dim, kernel_size=1)
self.act = nn.GELU()
self.point_wise_conv2 = nn.Conv2d(dim, dim, kernel_size=1)
self.drop_path = nn.Identity()
self.layer_scale = nn.Parameter(torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True)
def forward(self, x):
input = x
x = self.depth_wise_conv(x)
x = self.norm(x)
x = self.point_wise_conv1(x)
x = self.act(x)
x = self.point_wise_conv2(x)
x = input + self.drop_path(self.layer_scale * x)
return x
class SwiftFormerEncoderBlock(nn.Module):
"""
SwiftFormer Encoder Block for SwiftFormer. It consists of (1) Local representation module, (2)
SwiftFormerEfficientAdditiveAttention, and (3) MLP block.
Input: tensor of shape `[batch_size, channels, height, width]`
Output: tensor of shape `[batch_size, channels,height, width]`
"""
def __init__(self, config: SwiftFormerConfig, dim: int, drop_path: float = 0.0) -> None:
super().__init__()
layer_scale_init_value = config.layer_scale_init_value
use_layer_scale = config.use_layer_scale
self.local_representation = SwiftFormerLocalRepresentation(config, dim=dim)
self.attn = SwiftFormerEfficientAdditiveAttention(config, dim=dim)
self.linear = SwiftFormerMlp(config, in_features=dim)
self.drop_path = SwiftFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.use_layer_scale = use_layer_scale
if use_layer_scale:
self.layer_scale_1 = nn.Parameter(
layer_scale_init_value * torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True
)
self.layer_scale_2 = nn.Parameter(
layer_scale_init_value * torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True
)
def forward(self, x):
x = self.local_representation(x)
batch_size, channels, height, width = x.shape
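        # flatten the (height, width) grid into a token sequence for the additive attention,
        # then restore the (batch_size, channels, height, width) layout afterwards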
if self.use_layer_scale:
x = x + self.drop_path(
self.layer_scale_1
* self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
.reshape(batch_size, height, width, channels)
.permute(0, 3, 1, 2)
)
x = x + self.drop_path(self.layer_scale_2 * self.linear(x))
else:
x = x + self.drop_path(
self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
.reshape(batch_size, height, width, channels)
.permute(0, 3, 1, 2)
)
x = x + self.drop_path(self.linear(x))
return x
class SwiftFormerStage(nn.Module):
"""
A Swiftformer stage consisting of a series of `SwiftFormerConvEncoder` blocks and a final
`SwiftFormerEncoderBlock`.
Input: tensor in shape `[batch_size, channels, height, width]`
Output: tensor in shape `[batch_size, channels, height, width]`
"""
def __init__(self, config: SwiftFormerConfig, index: int) -> None:
super().__init__()
layer_depths = config.depths
dim = config.embed_dims[index]
depth = layer_depths[index]
blocks = []
for block_idx in range(depth):
block_dpr = config.drop_path_rate * (block_idx + sum(layer_depths[:index])) / (sum(layer_depths) - 1)
if depth - block_idx <= 1:
blocks.append(SwiftFormerEncoderBlock(config, dim=dim, drop_path=block_dpr))
else:
blocks.append(SwiftFormerConvEncoder(config, dim=dim))
self.blocks = nn.ModuleList(blocks)
def forward(self, input):
for block in self.blocks:
input = block(input)
return input
class SwiftFormerEncoder(nn.Module):
def __init__(self, config: SwiftFormerConfig) -> None:
super().__init__()
self.config = config
embed_dims = config.embed_dims
downsamples = config.downsamples
layer_depths = config.depths
# Transformer model
network = []
for i in range(len(layer_depths)):
stage = SwiftFormerStage(config=config, index=i)
network.append(stage)
if i >= len(layer_depths) - 1:
break
if downsamples[i] or embed_dims[i] != embed_dims[i + 1]:
# downsampling between two stages
network.append(SwiftFormerEmbeddings(config, index=i))
self.network = nn.ModuleList(network)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
all_hidden_states = (hidden_states,) if output_hidden_states else None
for block in self.network:
hidden_states = block(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
)
class SwiftFormerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SwiftFormerConfig
base_model_prefix = "swiftformer"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Conv2d, nn.Linear)):
nn.init.trunc_normal_(module.weight, std=0.02)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, (nn.LayerNorm)):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
def _set_gradient_checkpointing(self, module: SwiftFormerEncoder, value: bool = False) -> None:
if isinstance(module, SwiftFormerEncoder):
module.gradient_checkpointing = value
SWIFTFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`SwiftFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SWIFTFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare SwiftFormer Model transformer outputting raw hidden-states without any specific head on top.",
SWIFTFORMER_START_DOCSTRING,
)
class SwiftFormerModel(SwiftFormerPreTrainedModel):
def __init__(self, config: SwiftFormerConfig):
super().__init__(config)
self.config = config
self.patch_embed = SwiftFormerPatchEmbedding(config)
self.encoder = SwiftFormerEncoder(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SWIFTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithNoAttention]:
r""" """
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.patch_embed(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return tuple(v for v in encoder_outputs if v is not None)
return BaseModelOutputWithNoAttention(
last_hidden_state=encoder_outputs.last_hidden_state,
hidden_states=encoder_outputs.hidden_states,
)
@add_start_docstrings(
"""
SwiftFormer Model transformer with an image classification head on top (e.g. for ImageNet).
""",
SWIFTFORMER_START_DOCSTRING,
)
class SwiftFormerForImageClassification(SwiftFormerPreTrainedModel):
def __init__(self, config: SwiftFormerConfig) -> None:
super().__init__(config)
embed_dims = config.embed_dims
self.num_labels = config.num_labels
self.swiftformer = SwiftFormerModel(config)
# Classifier head
self.norm = nn.BatchNorm2d(embed_dims[-1], eps=config.batch_norm_eps)
self.head = nn.Linear(embed_dims[-1], self.num_labels) if self.num_labels > 0 else nn.Identity()
self.dist_head = nn.Linear(embed_dims[-1], self.num_labels) if self.num_labels > 0 else nn.Identity()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SWIFTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run base model
outputs = self.swiftformer(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs.last_hidden_state if return_dict else outputs[0]
# run classification head
sequence_output = self.norm(sequence_output)
sequence_output = sequence_output.flatten(2).mean(-1)
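        # the classification head and the distillation head are averaged to produce the final logits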
cls_out = self.head(sequence_output)
distillation_out = self.dist_head(sequence_output)
logits = (cls_out + distillation_out) / 2
# calculate loss
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
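A quick shape sanity check for the bare model, assuming the default `SwiftFormerConfig` corresponds to the XS-sized architecture (final embedding dimension 220, matching `_EXPECTED_OUTPUT_SHAPE` above):

import torch
from transformers import SwiftFormerConfig, SwiftFormerModel

config = SwiftFormerConfig()  # assumed to default to the XS-sized architecture
model = SwiftFormerModel(config).eval()
with torch.no_grad():
    outputs = model(torch.zeros(1, 3, 224, 224))
print(outputs.last_hidden_state.shape)  # expected: torch.Size([1, 220, 7, 7])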
@@ -6440,6 +6440,30 @@ class SqueezeBertPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])
SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
class SwiftFormerForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SwiftFormerModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class SwiftFormerPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch SwiftFormer model. """
import copy
import inspect
import unittest
from transformers import PretrainedConfig, SwiftFormerConfig
from transformers.testing_utils import (
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import SwiftFormerForImageClassification, SwiftFormerModel
from transformers.models.swiftformer.modeling_swiftformer import SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import ViTImageProcessor
class SwiftFormerModelTester:
def __init__(
self,
parent,
batch_size=13,
num_channels=3,
is_training=True,
use_labels=True,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
image_size=224,
num_labels=1000,
layer_depths=[3, 3, 6, 4],
embed_dims=[48, 56, 112, 220],
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.is_training = is_training
self.use_labels = use_labels
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.num_labels = num_labels
self.image_size = image_size
self.layer_depths = layer_depths
self.embed_dims = embed_dims
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return SwiftFormerConfig(
depths=self.layer_depths,
embed_dims=self.embed_dims,
mlp_ratio=4,
downsamples=[True, True, True, True],
hidden_act="gelu",
num_labels=self.num_labels,
down_patch_size=3,
down_stride=2,
down_pad=1,
drop_rate=0.0,
drop_path_rate=0.0,
use_layer_scale=True,
layer_scale_init_value=1e-5,
)
def create_and_check_model(self, config, pixel_values, labels):
model = SwiftFormerModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dims[-1], 7, 7))
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.num_labels
model = SwiftFormerForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
model = SwiftFormerForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def prepare_config_and_inputs_for_common(self):
(config, pixel_values, labels) = self.prepare_config_and_inputs()
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as SwiftFormer does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (SwiftFormerModel, SwiftFormerForImageClassification) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": SwiftFormerModel, "image-classification": SwiftFormerForImageClassification}
if is_torch_available()
else {}
)
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
has_attentions = False
def setUp(self):
self.model_tester = SwiftFormerModelTester(self)
self.config_tester = ConfigTester(
self,
config_class=SwiftFormerConfig,
has_text_modality=False,
hidden_size=37,
num_attention_heads=12,
num_hidden_layers=12,
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="SwiftFormer does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = SwiftFormerModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="SwiftFormer does not output attentions")
def test_attention_outputs(self):
pass
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_stages = 8
self.assertEqual(len(hidden_states), expected_num_stages) # TODO
# SwiftFormer's feature maps are of shape (batch_size, embed_dims, height, width)
# with the width and height being successively divided by 2, after every 2 blocks
for i in range(len(hidden_states)):
self.assertEqual(
hidden_states[i].shape,
torch.Size(
[
self.model_tester.batch_size,
self.model_tester.embed_dims[i // 2],
(self.model_tester.image_size // 4) // 2 ** (i // 2),
(self.model_tester.image_size // 4) // 2 ** (i // 2),
]
),
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
def test_initialization(self):
def _config_zero_init(config):
configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys():
if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
setattr(configs_no_init, key, 1e-10)
if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
no_init_subconfig = _config_zero_init(getattr(configs_no_init, key))
setattr(configs_no_init, key, no_init_subconfig)
return configs_no_init
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9) / 1e9).round().item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
class SwiftFormerModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs").to(torch_device)
feature_extractor = self.default_feature_extractor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([[-2.1703e00, 2.1107e00, -2.0811e00]])
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))