Unverified commit fbb45430, authored by NielsRogge, committed by GitHub

[SegFormer] Remove unused attributes (#16285)



* Remove unused attributes

* Add link to blog and add clarification about input size

* Improve readability of the code

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent f0c00d8c
@@ -50,7 +50,8 @@ Tips:
   ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
   found on the [hub](https://huggingface.co/models?other=segformer).
 - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and
-  fine-tuning on custom data).
+  fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
+- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`.
 - One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps
   for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in
   the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most
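As the new tip notes, the model infers spatial dimensions at runtime rather than from a fixed `image_size`, so inputs of arbitrary resolution can be passed. A minimal sketch (not part of this diff) of running a randomly initialized `SegformerForSemanticSegmentation` on a non-square input; the sizes and `num_labels` value are illustrative assumptions only:

```python
import torch
from transformers import SegformerConfig, SegformerForSemanticSegmentation

# Illustrative: a randomly initialized model; any SegFormer checkpoint could be used instead.
config = SegformerConfig(num_labels=150)
model = SegformerForSemanticSegmentation(config)
model.eval()

# An arbitrary (non-square) input resolution.
pixel_values = torch.randn(1, 3, 333, 517)

with torch.no_grad():
    outputs = model(pixel_values)

# Logits come out at roughly 1/4 of the input resolution: (batch_size, num_labels, ~H/4, ~W/4).
print(outputs.logits.shape)
```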
@@ -40,8 +40,6 @@ class SegformerConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        image_size (`int`, *optional*, defaults to 512):
-            The size (resolution) of each image.
         num_channels (`int`, *optional*, defaults to 3):
             The number of input channels.
         num_encoder_blocks (`int`, *optional*, defaults to 4):
@@ -52,8 +50,6 @@ class SegformerConfig(PretrainedConfig):
             Sequence reduction ratios in each encoder block.
         hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
             Dimension of each of the encoder blocks.
-        downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]):
-            Downsample rate of the image resolution compared to the original image size before each encoder block.
         patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
             Patch size before each encoder block.
         strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
@@ -101,13 +97,11 @@ class SegformerConfig(PretrainedConfig):
     def __init__(
         self,
-        image_size=224,
         num_channels=3,
         num_encoder_blocks=4,
         depths=[2, 2, 2, 2],
         sr_ratios=[8, 4, 2, 1],
         hidden_sizes=[32, 64, 160, 256],
-        downsampling_rates=[1, 4, 8, 16],
         patch_sizes=[7, 3, 3, 3],
         strides=[4, 2, 2, 2],
         num_attention_heads=[1, 2, 5, 8],
@@ -133,13 +127,11 @@ class SegformerConfig(PretrainedConfig):
                 FutureWarning,
             )
 
-        self.image_size = image_size
         self.num_channels = num_channels
         self.num_encoder_blocks = num_encoder_blocks
         self.depths = depths
         self.sr_ratios = sr_ratios
         self.hidden_sizes = hidden_sizes
-        self.downsampling_rates = downsampling_rates
         self.patch_sizes = patch_sizes
         self.strides = strides
         self.mlp_ratios = mlp_ratios
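With `image_size` and `downsampling_rates` gone, the per-stage downsampling is fully determined by `patch_sizes` and `strides`. A small sketch (not part of this diff) of instantiating the configuration from the remaining arguments; the values simply repeat the MiT-b0 defaults shown above:

```python
from transformers import SegformerConfig, SegformerModel

# image_size / downsampling_rates are no longer configuration attributes.
config = SegformerConfig(
    num_encoder_blocks=4,
    depths=[2, 2, 2, 2],
    sr_ratios=[8, 4, 2, 1],
    hidden_sizes=[32, 64, 160, 256],
    patch_sizes=[7, 3, 3, 3],
    strides=[4, 2, 2, 2],
    num_attention_heads=[1, 2, 5, 8],
)
model = SegformerModel(config)  # randomly initialized encoder
print(sum(p.numel() for p in model.parameters()))
```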
@@ -15,7 +15,6 @@
 """ PyTorch SegFormer model."""
 
-import collections
 import math
 from typing import Optional, Tuple, Union
@@ -58,18 +57,8 @@ SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-# Inspired by
-# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
-# From PyTorch internals
-def to_2tuple(x):
-    if isinstance(x, collections.abc.Iterable):
-        return x
-    return (x, x)
-
-
-# Stochastic depth implementation
-# Taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
-def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+# Copied from transformers.models.convnext.modeling_convnext.drop_path
+def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep=True):
     """
     Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the
     DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop
@@ -87,7 +76,8 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
     return output
 
 
-class DropPath(nn.Module):
+# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Segformer
+class SegformerDropPath(nn.Module):
     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
 
     def __init__(self, drop_prob=None):
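For context on what the copied `drop_path` helper does: during training it randomly zeroes the residual branch for a subset of samples and rescales the rest, i.e. stochastic depth. A self-contained sketch of the idea (not the exact code referenced above):

```python
import torch


def drop_path_sketch(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """Randomly drop the residual branch per sample (stochastic depth)."""
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dimensions.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize to 0 or 1
    return x.div(keep_prob) * random_tensor


# Usage inside a residual block:
# hidden_states = residual + drop_path_sketch(layer_output, drop_prob=0.1, training=True)
```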
@@ -99,34 +89,35 @@ class DropPath(nn.Module):
 class SegformerOverlapPatchEmbeddings(nn.Module):
-    """Construct the patch embeddings from an image."""
+    """Construct the overlapping patch embeddings."""
 
-    def __init__(self, image_size, patch_size, stride, num_channels, hidden_size):
+    def __init__(self, patch_size, stride, num_channels, hidden_size):
         super().__init__()
-        image_size = to_2tuple(image_size)
-        patch_size = to_2tuple(patch_size)
-        self.height, self.width = image_size[0] // patch_size[0], image_size[1] // patch_size[1]
-        self.num_patches = self.height * self.width
         self.proj = nn.Conv2d(
             num_channels,
             hidden_size,
             kernel_size=patch_size,
             stride=stride,
-            padding=(patch_size[0] // 2, patch_size[1] // 2),
+            padding=patch_size // 2,
         )
 
         self.layer_norm = nn.LayerNorm(hidden_size)
 
     def forward(self, pixel_values):
-        x = self.proj(pixel_values)
-        _, _, height, width = x.shape
-        x = x.flatten(2).transpose(1, 2)
-        x = self.layer_norm(x)
-        return x, height, width
+        embeddings = self.proj(pixel_values)
+        _, _, height, width = embeddings.shape
+        # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
+        # this can be fed to a Transformer layer
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+        embeddings = self.layer_norm(embeddings)
+        return embeddings, height, width
 
 
 class SegformerEfficientSelfAttention(nn.Module):
-    def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
+    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
+    paper](https://arxiv.org/abs/2102.12122)."""
+
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
         super().__init__()
         self.hidden_size = hidden_size
         self.num_attention_heads = num_attention_heads
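The rewritten `forward` above only turns the convolutional feature map into a token sequence. A shape-only sketch (with illustrative sizes matching the first SegFormer stage) of what the flatten/transpose does:

```python
import torch
from torch import nn

# First-stage overlapping patch embedding: 7x7 conv, stride 4, padding 3.
proj = nn.Conv2d(3, 32, kernel_size=7, stride=4, padding=3)

pixel_values = torch.randn(2, 3, 224, 224)
embeddings = proj(pixel_values)          # (2, 32, 56, 56)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2)       # (2, 32, 3136): merge the spatial dimensions
embeddings = embeddings.transpose(1, 2)  # (2, 3136, 32): one 32-dim token per spatial position
print(embeddings.shape, height, width)
```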
@@ -146,15 +137,17 @@ class SegformerEfficientSelfAttention(nn.Module):
 
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 
-        self.sr_ratio = sr_ratio
-        if sr_ratio > 1:
-            self.sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr_ratio, stride=sr_ratio)
+        self.sr_ratio = sequence_reduction_ratio
+        if sequence_reduction_ratio > 1:
+            self.sr = nn.Conv2d(
+                hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
+            )
             self.layer_norm = nn.LayerNorm(hidden_size)
 
-    def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(*new_x_shape)
-        return x.permute(0, 2, 1, 3)
+    def transpose_for_scores(self, hidden_states):
+        new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        hidden_states = hidden_states.view(*new_shape)
+        return hidden_states.permute(0, 2, 1, 3)
 
     def forward(
         self,
@@ -167,8 +160,11 @@ class SegformerEfficientSelfAttention(nn.Module):
 
         if self.sr_ratio > 1:
             batch_size, seq_len, num_channels = hidden_states.shape
+            # Reshape to (batch_size, num_channels, height, width)
             hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
+            # Apply sequence reduction
             hidden_states = self.sr(hidden_states)
+            # Reshape back to (batch_size, seq_len, num_channels)
             hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
             hidden_states = self.layer_norm(hidden_states)
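The `sequence_reduction_ratio` argument renamed above controls how much the key/value sequence is shortened before attention: tokens are folded back into a feature map, downsampled with a strided convolution, and flattened again. A shape-only sketch of that round trip (sizes are illustrative):

```python
import torch
from torch import nn

batch_size, height, width, hidden_size = 2, 56, 56, 32
sequence_reduction_ratio = 8

hidden_states = torch.randn(batch_size, height * width, hidden_size)  # (2, 3136, 32)

# Strided convolution used for the sequence reduction.
sr = nn.Conv2d(hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio)

# tokens -> feature map -> reduced feature map -> shorter token sequence
feature_map = hidden_states.permute(0, 2, 1).reshape(batch_size, hidden_size, height, width)  # (2, 32, 56, 56)
reduced = sr(feature_map)                                                                     # (2, 32, 7, 7)
reduced_tokens = reduced.reshape(batch_size, hidden_size, -1).permute(0, 2, 1)                # (2, 49, 32)

# Keys and values are computed from 49 tokens instead of 3136, cutting attention cost by roughly sr_ratio**2.
print(reduced_tokens.shape)
```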
@@ -211,10 +207,13 @@ class SegformerSelfOutput(nn.Module):
 
 class SegformerAttention(nn.Module):
-    def __init__(self, config, hidden_size, num_attention_heads, sr_ratio):
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
         super().__init__()
         self.self = SegformerEfficientSelfAttention(
-            config=config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
+            config=config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
         )
         self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
         self.pruned_heads = set()
@@ -285,13 +284,16 @@ class SegformerMixFFN(nn.Module):
 class SegformerLayer(nn.Module):
     """This corresponds to the Block class in the original implementation."""
 
-    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sr_ratio, mlp_ratio):
+    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
         super().__init__()
         self.layer_norm_1 = nn.LayerNorm(hidden_size)
         self.attention = SegformerAttention(
-            config, hidden_size=hidden_size, num_attention_heads=num_attention_heads, sr_ratio=sr_ratio
+            config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
         )
-        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
         self.layer_norm_2 = nn.LayerNorm(hidden_size)
         mlp_hidden_size = int(hidden_size * mlp_ratio)
         self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
@@ -328,14 +330,13 @@ class SegformerEncoder(nn.Module):
         self.config = config
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
 
         # patch embeddings
         embeddings = []
         for i in range(config.num_encoder_blocks):
             embeddings.append(
                 SegformerOverlapPatchEmbeddings(
-                    image_size=config.image_size // config.downsampling_rates[i],
                     patch_size=config.patch_sizes[i],
                     stride=config.strides[i],
                     num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
@@ -358,8 +359,8 @@ class SegformerEncoder(nn.Module):
                         config,
                         hidden_size=config.hidden_sizes[i],
                         num_attention_heads=config.num_attention_heads[i],
-                        drop_path=dpr[cur + j],
-                        sr_ratio=config.sr_ratios[i],
+                        drop_path=drop_path_decays[cur + j],
+                        sequence_reduction_ratio=config.sr_ratios[i],
                         mlp_ratio=config.mlp_ratios[i],
                     )
                 )
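The renamed `drop_path_decays` list implements a linear stochastic depth schedule: the drop probability grows from 0 in the first layer to `config.drop_path_rate` in the last layer, counted across all encoder blocks. A quick numeric sketch, assuming the default depths and a drop path rate of 0.1:

```python
import torch

depths = [2, 2, 2, 2]   # config.depths
drop_path_rate = 0.1    # config.drop_path_rate (assumed default)

drop_path_decays = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
print([round(p, 3) for p in drop_path_decays])
# [0.0, 0.014, 0.029, 0.043, 0.057, 0.071, 0.086, 0.1]
```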