Unverified Commit 559a45d1 authored by amyeroberts, committed by GitHub

Backbone add out indices (#22493)

* Add out_indices to backbones, deprecate out_features

* Update - can specify either out_features or out_indices but not both

* Can specify both

* Fix copies

* Add out_indices to convnextv2 configuration
parent db803b69
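
The change is the same across every config touched here: `out_indices` becomes an accepted argument alongside `out_features`, and the two are reconciled in `__init__`. A minimal sketch of the resulting behaviour, using `BitConfig` (the other configs below behave identically):

```python
from transformers import BitConfig

# The default BitConfig has stage_names == ["stem", "stage1", "stage2", "stage3", "stage4"]
config = BitConfig(out_indices=[1, 2])
print(config.out_features)  # ['stage1', 'stage2'] (derived from out_indices)

config = BitConfig(out_features=["stage1", "stage2"])
print(config.out_indices)   # [1, 2] (derived from out_features)

config = BitConfig()        # neither set: both default to the last stage
print(config.out_features, config.out_indices)  # ['stage4'] [4]

# Both may be set, but only if they describe the same stages:
config = BitConfig(out_features=["stage1"], out_indices=[2])  # raises ValueError
```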
@@ -63,7 +63,12 @@ class BitConfig(PretrainedConfig):
             The width factor for the model.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
     ```python
@@ -98,6 +103,7 @@ class BitConfig(PretrainedConfig):
         output_stride=32,
         width_factor=1,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -122,6 +128,21 @@ class BitConfig(PretrainedConfig):
         self.width_factor = width_factor
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -130,4 +151,12 @@ class BitConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices
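
The reconciliation block above is copied verbatim into each config in this commit (hence the "Fix copies" step). As a standalone, runnable sketch of the same logic (the helper name is ours, not part of the library):

```python
def resolve_out_features_and_indices(stage_names, out_features=None, out_indices=None):
    # Mirrors the block added to each config's __init__ in this commit.
    if out_features is not None and out_indices is not None:
        if len(out_features) != len(out_indices):
            raise ValueError("out_features and out_indices should have the same length if both are set")
        elif out_features != [stage_names[idx] for idx in out_indices]:
            raise ValueError("out_features and out_indices should correspond to the same stages if both are set")

    if out_features is None and out_indices is not None:
        out_features = [stage_names[idx] for idx in out_indices]
    elif out_features is not None and out_indices is None:
        out_indices = [stage_names.index(feature) for feature in out_features]
    elif out_features is None and out_indices is None:
        out_features = [stage_names[-1]]
        out_indices = [len(stage_names) - 1]
    return out_features, out_indices


names = ["stem", "stage1", "stage2", "stage3"]
print(resolve_out_features_and_indices(names, out_indices=[0, 2]))
# (['stem', 'stage2'], [0, 2])
print(resolve_out_features_and_indices(names))
# (['stage3'], [3])
```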
@@ -850,6 +850,10 @@ class BitBackbone(BitPreTrainedModel, BackboneMixin):
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
         self.num_features = [config.embedding_size] + config.hidden_sizes
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         # initialize weights and apply final processing
         self.post_init()
...
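In the backbone classes, `config.out_indices` is used directly when present; otherwise the indices are recovered from `out_features`. Since the config now always resolves both, the fallback mainly guards configs created before this change. What the fallback expression computes, as a quick sketch:

```python
# Recovering indices from feature names, as the backbones in this commit do:
stage_names = ["stem", "stage1", "stage2", "stage3", "stage4"]
out_features = ["stage1", "stage3"]

out_indices = tuple(i for i, layer in enumerate(stage_names) if layer in out_features)
print(out_indices)  # (1, 3)
```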
@@ -66,7 +66,12 @@ class ConvNextConfig(PretrainedConfig):
             The drop rate for stochastic depth.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
     ```python
@@ -97,6 +102,7 @@ class ConvNextConfig(PretrainedConfig):
         drop_path_rate=0.0,
         image_size=224,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -113,6 +119,21 @@ class ConvNextConfig(PretrainedConfig):
         self.drop_path_rate = drop_path_rate
         self.image_size = image_size
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -121,7 +142,15 @@ class ConvNextConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices


 class ConvNextOnnxConfig(OnnxConfig):
...
@@ -487,6 +487,10 @@ class ConvNextBackbone(ConvNextPreTrainedModel, BackboneMixin):
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
         self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         # Add layer norms to hidden states of out_features
         hidden_states_norms = {}
...
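End to end, selecting stages by index yields multi-scale feature maps from the backbone. A small sketch with randomly initialized weights; the printed shapes assume the default `ConvNextConfig` (hidden sizes 96/192/384/768, patch size 4):

```python
import torch
from transformers import ConvNextBackbone, ConvNextConfig

config = ConvNextConfig(out_indices=[1, 2, 3, 4])  # all four stages, skipping the stem
model = ConvNextBackbone(config)

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)

# One feature map per selected stage, at decreasing spatial resolution
for name, feature_map in zip(model.out_features, outputs.feature_maps):
    print(name, tuple(feature_map.shape))
# stage1 (1, 96, 56, 56)
# stage2 (1, 192, 28, 28)
# stage3 (1, 384, 14, 14)
# stage4 (1, 768, 7, 7)
```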
@@ -58,7 +58,12 @@ class ConvNextV2Config(PretrainedConfig):
             The drop rate for stochastic depth.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
     ```python
@@ -88,6 +93,7 @@ class ConvNextV2Config(PretrainedConfig):
         drop_path_rate=0.0,
         image_size=224,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -103,6 +109,21 @@ class ConvNextV2Config(PretrainedConfig):
         self.drop_path_rate = drop_path_rate
         self.image_size = image_size
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -111,4 +132,12 @@ class ConvNextV2Config(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices
...
@@ -510,6 +510,10 @@ class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
         self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         # Add layer norms to hidden states of out_features
         hidden_states_norms = {}
...
@@ -72,7 +72,12 @@ class DinatConfig(PretrainedConfig):
             The initial value for the layer scale. Disabled if <=0.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
@@ -114,6 +119,7 @@ class DinatConfig(PretrainedConfig):
         layer_norm_eps=1e-5,
         layer_scale_init_value=0.0,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -139,6 +145,21 @@ class DinatConfig(PretrainedConfig):
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.layer_scale_init_value = layer_scale_init_value
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -147,4 +168,12 @@ class DinatConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices
...
@@ -891,6 +891,10 @@ class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
         self.encoder = DinatEncoder(config)
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]

         # Add layer norms to hidden states of out_features
...
@@ -68,7 +68,12 @@ class MaskFormerSwinConfig(PretrainedConfig):
             The epsilon used by the layer normalization layers.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
@@ -110,6 +115,7 @@ class MaskFormerSwinConfig(PretrainedConfig):
         initializer_range=0.02,
         layer_norm_eps=1e-5,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -135,6 +141,21 @@ class MaskFormerSwinConfig(PretrainedConfig):
         # this indicates the channel dimension after the last stage of the model
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -143,4 +164,12 @@ class MaskFormerSwinConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices
...
@@ -859,6 +859,10 @@ class MaskFormerSwinBackbone(MaskFormerSwinPreTrainedModel, BackboneMixin):
         if "stem" in self.out_features:
             raise ValueError("This backbone does not support 'stem' in the `out_features`.")
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
         self.hidden_states_norms = nn.ModuleList([nn.LayerNorm(num_channels) for num_channels in self.channels])
...
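Note how this interacts with the `'stem'` restriction above: `out_indices=[0]` now resolves to `out_features=["stem"]` inside the config, which `MaskFormerSwinBackbone` then rejects at construction time. A sketch:

```python
from transformers import MaskFormerSwinConfig

config = MaskFormerSwinConfig(out_indices=[0])
print(config.out_features)  # ['stem'], which MaskFormerSwinBackbone refuses with a ValueError
```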
@@ -70,7 +70,12 @@ class NatConfig(PretrainedConfig):
             The initial value for the layer scale. Disabled if <=0.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
@@ -111,6 +116,7 @@ class NatConfig(PretrainedConfig):
         layer_norm_eps=1e-5,
         layer_scale_init_value=0.0,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -135,6 +141,21 @@ class NatConfig(PretrainedConfig):
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.layer_scale_init_value = layer_scale_init_value
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -143,4 +164,12 @@ class NatConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices
...
@@ -869,6 +869,10 @@ class NatBackbone(NatPreTrainedModel, BackboneMixin):
         self.encoder = NatEncoder(config)
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]

         # Add layer norms to hidden states of out_features
...
@@ -60,7 +60,12 @@ class ResNetConfig(PretrainedConfig):
             If `True`, the first stage will downsample the inputs using a `stride` of 2.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
     ```python
@@ -89,6 +94,7 @@ class ResNetConfig(PretrainedConfig):
         hidden_act="relu",
         downsample_in_first_stage=False,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -102,6 +108,21 @@ class ResNetConfig(PretrainedConfig):
         self.hidden_act = hidden_act
         self.downsample_in_first_stage = downsample_in_first_stage
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -110,7 +131,15 @@ class ResNetConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices


 class ResNetOnnxConfig(OnnxConfig):
...
@@ -437,6 +437,10 @@ class ResNetBackbone(ResNetPreTrainedModel, BackboneMixin):
         self.encoder = ResNetEncoder(config)
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         self.num_features = [config.embedding_size] + config.hidden_sizes

         # initialize weights and apply final processing
...
@@ -83,7 +83,12 @@ class SwinConfig(PretrainedConfig):
             Factor to increase the spatial resolution by in the decoder head for masked image modeling.
         out_features (`List[str]`, *optional*):
             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-            (depending on how many stages the model has). Will default to the last stage if unset.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.

     Example:
@@ -126,6 +131,7 @@ class SwinConfig(PretrainedConfig):
         layer_norm_eps=1e-5,
         encoder_stride=32,
         out_features=None,
+        out_indices=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -152,6 +158,21 @@ class SwinConfig(PretrainedConfig):
         # this indicates the channel dimension after the last stage of the model
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+
+        if out_features is not None and out_indices is not None:
+            if len(out_features) != len(out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            elif out_features != [self.stage_names[idx] for idx in out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+
+        if out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        elif out_features is not None and out_indices is None:
+            out_indices = [self.stage_names.index(feature) for feature in out_features]
+        elif out_features is None and out_indices is None:
+            out_features = [self.stage_names[-1]]
+            out_indices = [len(self.stage_names) - 1]
+
         if out_features is not None:
             if not isinstance(out_features, list):
                 raise ValueError("out_features should be a list")
@@ -160,7 +181,15 @@ class SwinConfig(PretrainedConfig):
                     raise ValueError(
                         f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}"
                     )
+        if out_indices is not None:
+            if not isinstance(out_indices, (list, tuple)):
+                raise ValueError("out_indices should be a list or tuple")
+            for idx in out_indices:
+                if idx >= len(self.stage_names):
+                    raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}")
+
         self.out_features = out_features
+        self.out_indices = out_indices


 class SwinOnnxConfig(OnnxConfig):
...
@@ -1255,6 +1255,10 @@ class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
         self.encoder = SwinEncoder(config, self.embeddings.patch_grid)
         self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]]
+        if config.out_indices is not None:
+            self.out_indices = config.out_indices
+        else:
+            self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features)

         self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]

         # Add layer norms to hidden states of out_features
...
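Because `out_indices` is stored as a regular config attribute, it round-trips through serialization just as `out_features` does. A sketch with `SwinConfig`:

```python
from transformers import SwinConfig

config = SwinConfig(out_indices=[2, 3])
data = config.to_dict()
print(data["out_features"], data["out_indices"])  # ['stage2', 'stage3'] [2, 3]

restored = SwinConfig.from_dict(data)
print(restored.out_features, restored.out_indices)  # ['stage2', 'stage3'] [2, 3]
```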