Unverified commit fbb45430, authored by NielsRogge, committed by GitHub

[SegFormer] Remove unused attributes (#16285)



* Remove unused attributes

* Add link to blog and add clarification about input size

* Improve readability of the code

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent f0c00d8c
@@ -50,7 +50,8 @@ Tips:
   ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
   found on the [hub](https://huggingface.co/models?other=segformer).
 - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
-  fine-tuning on custom data).
+  fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
+- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`.
 - One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
   for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
   the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
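As the new tip notes, the model infers spatial dimensions at runtime rather than from a fixed `image_size`, so inputs of arbitrary resolution can be passed. A minimal sketch (not part of this diff) of running a randomly initialized `SegformerForSemanticSegmentation` on a non-square input; the sizes and `num_labels` value are illustrative assumptions only:

```python
import torch
from transformers import SegformerConfig, SegformerForSemanticSegmentation

# Illustrative: a randomly initialized model; any SegFormer checkpoint could be used instead.
config = SegformerConfig(num_labels=150)
model = SegformerForSemanticSegmentation(config)
model.eval()

# An arbitrary (non-square) input resolution.
pixel_values = torch.randn(1, 3, 333, 517)

with torch.no_grad():
    outputs = model(pixel_values)

# Logits come out at roughly 1/4 of the input resolution: (batch_size, num_labels, ~H/4, ~W/4).
print(outputs.logits.shape)
```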
@@ -40,8 +40,6 @@ class SegformerConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        image_size (`int`, *optional*, defaults to 512):
-            The size (resolution) of each image.
         num_channels (`int`, *optional*, defaults to 3):
             The number of input channels.
         num_encoder_blocks (`int`, *optional*, defaults to 4):
@@ -52,8 +50,6 @@ class SegformerConfig(PretrainedConfig):
             Sequence reduction ratios in each encoder block.
         hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
             Dimension of each of the encoder blocks.
-        downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]):
-            Downsample rate of the image resolution compared to the original image size before each encoder block.
         patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
             Patch size before each encoder block.
         strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
@@ -101,13 +97,11 @@ class SegformerConfig(PretrainedConfig):
     def __init__(
         self,
-        image_size=224,
         num_channels=3,
         num_encoder_blocks=4,
         depths=[2, 2, 2, 2],
         sr_ratios=[8, 4, 2, 1],
         hidden_sizes=[32, 64, 160, 256],
-        downsampling_rates=[1, 4, 8, 16],
         patch_sizes=[7, 3, 3, 3],
         strides=[4, 2, 2, 2],
         num_attention_heads=[1, 2, 5, 8],
@@ -133,13 +127,11 @@ class SegformerConfig(PretrainedConfig):
                 FutureWarning,
             )
 
-        self.image_size = image_size
         self.num_channels = num_channels
         self.num_encoder_blocks = num_encoder_blocks
         self.depths = depths
         self.sr_ratios = sr_ratios
         self.hidden_sizes = hidden_sizes
-        self.downsampling_rates = downsampling_rates
         self.patch_sizes = patch_sizes
         self.strides = strides
         self.mlp_ratios = mlp_ratios
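With `image_size` and `downsampling_rates` gone, the per-stage downsampling is fully determined by `patch_sizes` and `strides`. A small sketch (not part of this diff) of instantiating the configuration from the remaining arguments; the values simply repeat the MiT-b0 defaults shown above:

```python
from transformers import SegformerConfig, SegformerModel

# image_size / downsampling_rates are no longer configuration attributes.
config = SegformerConfig(
    num_encoder_blocks=4,
    depths=[2, 2, 2, 2],
    sr_ratios=[8, 4, 2, 1],
    hidden_sizes=[32, 64, 160, 256],
    patch_sizes=[7, 3, 3, 3],
    strides=[4, 2, 2, 2],
    num_attention_heads=[1, 2, 5, 8],
)
model = SegformerModel(config)  # randomly initialized encoder
print(sum(p.numel() for p in model.parameters()))
```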
@@ -15,7 +15,6 @@
 """ PyTorch SegFormer model."""
 
-import collections
 import math
 from typing import Optional, Tuple, Union
@@ -58,18 +57,8 @@ SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-# Inspired by
-# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
-# From PyTorch internals
-def to_2tuple(x):
-    if isinstance(x, collections.abc.Iterable):
-        return x
-    return (x, x)
-
-
-# Stochastic depth implementation
-# Taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
-def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+# Copied from transformers.models.convnext.modeling_convnext.drop_path
+def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep=True):
     """
     Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the
     DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop
@@ -87,7 +76,8 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
     return output
 
 
-class DropPath(nn.Module):
+# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Segformer
+class SegformerDropPath(nn.Module):
     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
 
     def __init__(self, drop_prob=None):
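For context on what the copied `drop_path` helper does: during training it randomly zeroes the residual branch for a subset of samples and rescales the rest, i.e. stochastic depth. A self-contained sketch of the idea (not the exact code referenced above):

```python
import torch


def drop_path_sketch(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """Randomly drop the residual branch per sample (stochastic depth)."""
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dimensions.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize to 0 or 1
    return x.div(keep_prob) * random_tensor


# Usage inside a residual block:
# hidden_states = residual + drop_path_sketch(layer_output, drop_prob=0.1, training=True)
```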
@@ -99,34 +89,35 @@ class DropPath(nn.Module):
 class SegformerOverlapPatchEmbeddings(nn.Module):
-    """Construct the patch embeddings from an image."""
+    """Construct the overlapping patch embeddings."""
 
-    def __init__(self, image_size, patch_size, stride, num_channels, hidden_size):
+    def __init__(self, patch_size, stride, num_channels, hidden_size):
         super().__init__()
-        image_size = to_2tuple(image_size)
-        patch_size = to_2tuple(patch_size)
-        self.height, self.width = image_size[0] // patch_size[0], image_size[1] // patch_size[1]
-        self.num_patches = self.height * self.width
         self.proj = nn.Conv2d(
             num_channels,
             hidden_size,
             kernel_size=patch_size,
             stride=stride,
-            padding=(patch_size[0] // 2, patch_size[1] // 2),
+            padding=patch_size // 2,
         )
 
         self.layer_norm = nn.LayerNorm(hidden_size)
 
     def forward(self, pixel_values):
-        x = self.proj(pixel_values)
-        _, _, height, width = x.shape
-        x = x.flatten(2).transpose(1, 2)
-        x = self.layer_norm(x)
-        return x, height, width
+        embeddings = self.proj(pixel_values)
+        _, _, height, width = embeddings.shape
+        # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
+        # this can be fed to a Transformer layer
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+        embeddings = self.layer_norm(embeddings)
+        return embeddings, height, width
 
 
 class SegformerEfficientSelfAttention(nn.Module):
-    def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
+    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
+    paper](https://arxiv.org/abs/2102.12122)."""
+
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
         super().__init__()
         self.hidden_size = hidden_size
         self.num_attention_heads = num_attention_heads
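The rewritten `forward` above only turns the convolutional feature map into a token sequence. A shape-only sketch (with illustrative sizes matching the first SegFormer stage) of what the flatten/transpose does:

```python
import torch
from torch import nn

# First-stage overlapping patch embedding: 7x7 conv, stride 4, padding 3.
proj = nn.Conv2d(3, 32, kernel_size=7, stride=4, padding=3)

pixel_values = torch.randn(2, 3, 224, 224)
embeddings = proj(pixel_values)          # (2, 32, 56, 56)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2)       # (2, 32, 3136): merge the spatial dimensions
embeddings = embeddings.transpose(1, 2)  # (2, 3136, 32): one 32-dim token per spatial position
print(embeddings.shape, height, width)
```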
@@ -146,15 +137,17 @@ class SegformerEfficientSelfAttention(nn.Module):
 
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 
-        self.sr_ratio = sr_ratio
-        if sr_ratio > 1:
-            self.sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr_ratio, stride=sr_ratio)
+        self.sr_ratio = sequence_reduction_ratio
+        if sequence_reduction_ratio > 1:
+            self.sr = nn.Conv2d(
+                hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
+            )
             self.layer_norm = nn.LayerNorm(hidden_size)
 
-    def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(*new_x_shape)
-        return x.permute(0, 2, 1, 3)
+    def transpose_for_scores(self, hidden_states):
+        new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        hidden_states = hidden_states.view(*new_shape)
+        return hidden_states.permute(0, 2, 1, 3)
 
     def forward(
         self,
@@ -167,8 +160,11 @@ class SegformerEfficientSelfAttention(nn.Module):
 
         if self.sr_ratio > 1:
             batch_size, seq_len, num_channels = hidden_states.shape
+            # Reshape to (batch_size, num_channels, height, width)
             hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
+            # Apply sequence reduction
             hidden_states = self.sr(hidden_states)
+            # Reshape back to (batch_size, seq_len, num_channels)
             hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
             hidden_states = self.layer_norm(hidden_states)
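The `sequence_reduction_ratio` argument renamed above controls how much the key/value sequence is shortened before attention: tokens are folded back into a feature map, downsampled with a strided convolution, and flattened again. A shape-only sketch of that round trip (sizes are illustrative):

```python
import torch
from torch import nn

batch_size, height, width, hidden_size = 2, 56, 56, 32
sequence_reduction_ratio = 8

hidden_states = torch.randn(batch_size, height * width, hidden_size)  # (2, 3136, 32)

# Strided convolution used for the sequence reduction.
sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio)

# tokens -> feature map -> reduced feature map -> shorter token sequence
feature_map = hidden_states.permute(0, 2, 1).reshape(batch_size, hidden_size, height, width)  # (2, 32, 56, 56)
reduced = sr(feature_map)                                                                     # (2, 32, 7, 7)
reduced_tokens = reduced.reshape(batch_size, hidden_size, -1).permute(0, 2, 1)                # (2, 49, 32)

# Keys and values are computed from 49 tokens instead of 3136, cutting attention cost by roughly sr_ratio**2.
print(reduced_tokens.shape)
```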
@@ -211,10 +207,13 @@ class SegformerSelfOutput(nn.Module):
 
 class SegformerAttention(nn.Module):
-    def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
         super().__init__()
         self.self = SegformerEfficientSelfAttention(
-            config=config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
+            config=config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
         )
         self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
         self.pruned_heads = set()
@@ -285,13 +284,16 @@ class SegformerMixFFN(nn.Module):
 class SegformerLayer(nn.Module):
     """This corresponds to the Block class in the original implementation."""
 
-    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sr_ratio, mlp_ratio):
+    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
         super().__init__()
         self.layer_norm_1 = nn.LayerNorm(hidden_size)
         self.attention = SegformerAttention(
-            config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
+            config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
         )
-        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
         self.layer_norm_2 = nn.LayerNorm(hidden_size)
         mlp_hidden_size = int(hidden_size * mlp_ratio)
         self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
@@ -328,14 +330,13 @@ class SegformerEncoder(nn.Module):
         self.config = config
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
 
         # patch embeddings
         embeddings = []
         for i in range(config.num_encoder_blocks):
             embeddings.append(
                 SegformerOverlapPatchEmbeddings(
-                    image_size=config.image_size // config.downsampling_rates[i],
                     patch_size=config.patch_sizes[i],
                     stride=config.strides[i],
                     num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
@@ -358,8 +359,8 @@ class SegformerEncoder(nn.Module):
                         config,
                         hidden_size=config.hidden_sizes[i],
                         num_attention_heads=config.num_attention_heads[i],
-                        drop_path=dpr[cur + j],
-                        sr_ratio=config.sr_ratios[i],
+                        drop_path=drop_path_decays[cur + j],
+                        sequence_reduction_ratio=config.sr_ratios[i],
                         mlp_ratio=config.mlp_ratios[i],
                     )
                 )
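The renamed `drop_path_decays` list implements a linear stochastic depth schedule: the drop probability grows from 0 in the first layer to `config.drop_path_rate` in the last layer, counted across all encoder blocks. A quick numeric sketch, assuming the default depths and a drop path rate of 0.1:

```python
import torch

depths = [2, 2, 2, 2]   # config.depths
drop_path_rate = 0.1    # config.drop_path_rate (assumed default)

drop_path_decays = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
print([round(p, 3) for p in drop_path_decays])
# [0.0, 0.014, 0.029, 0.043, 0.057, 0.071, 0.086, 0.1]
```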