"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "605f3245dcca34381c35520c35ba0b701ed80d58"
Unverified Commit 5a995735 authored by Patrick von Platen, committed by GitHub

Add Wav2Vec2Conformer (#16812)



* save intermediate

* add wav2vec2 conformer

* add more code

* more

* first test passes

* make all checkpoints work

* update

* up

* more clean ups

* save clean-up

* save clean-up

* save more

* remove bogus

* finalize design conformer

* remove vision

* finish all tests

* more changes

* finish code

* add doc tests

* add slow tests

* fix autoconfig test

* up

* correct docstring

* up

* update

* fix

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update docs/source/en/model_doc/wav2vec2-conformer.mdx

* upload

* save copied from

* correct configs

* fix model outputs

* add to docs

* fix imports

* finish

* finish code

* correct copied from

* correct again

* correct make fix

* improve make fix copies

* save

* correct fix copy from

* correct init structure

* correct

* fix import

* apply suggestions
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
parent f0395cf5
@@ -27,7 +27,7 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
 from ...utils import (
@@ -71,35 +71,6 @@ UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class UniSpeechBaseModelOutput(ModelOutput):
-    """
-    Output type of [`UniSpeechBaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
 class UniSpeechForPreTrainingOutput(ModelOutput):
     """
@@ -1158,7 +1129,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
     @add_code_sample_docstrings(
         processor_class=_PROCESSOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=UniSpeechBaseModelOutput,
+        output_type=Wav2Vec2BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
         modality="audio",
         expected_output=_EXPECTED_OUTPUT_SHAPE,
@@ -1171,7 +1142,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, UniSpeechBaseModelOutput]:
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1203,7 +1174,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
         if not return_dict:
             return (hidden_states, extract_features) + encoder_outputs[1:]
-        return UniSpeechBaseModelOutput(
+        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
......
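Note: the hunks above replace the model-specific `UniSpeechBaseModelOutput` with the shared `Wav2Vec2BaseModelOutput`, now imported from `transformers.modeling_outputs`. A minimal sketch of the shared output's fields, using placeholder tensors rather than real model outputs (the shapes are illustrative only):

```python
import torch
from transformers.modeling_outputs import Wav2Vec2BaseModelOutput

# Placeholder tensors, just to show the fields every affected model now returns.
output = Wav2Vec2BaseModelOutput(
    last_hidden_state=torch.zeros(1, 292, 768),  # (batch_size, sequence_length, hidden_size)
    extract_features=torch.zeros(1, 292, 512),   # (batch_size, sequence_length, conv_dim[-1])
)
print(output.last_hidden_state.shape, output.extract_features.shape)
```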
@@ -81,13 +81,13 @@ class UniSpeechSatConfig(PretrainedConfig):
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature encoder states.
-        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
-        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
-        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
             *conv_dim*.
@@ -159,13 +159,13 @@ class UniSpeechSatConfig(PretrainedConfig):
             instance of [`UniSpeechSatForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
             A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
             module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
-        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
             *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
-        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
             A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
......
@@ -27,7 +27,14 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
 from ...utils import (
@@ -77,35 +84,6 @@ UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class UniSpeechSatBaseModelOutput(ModelOutput):
-    """
-    Output type of [`UniSpeechSatBaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
 class UniSpeechSatForPreTrainingOutput(ModelOutput):
     """
@@ -143,38 +121,6 @@ class UniSpeechSatForPreTrainingOutput(ModelOutput):
     attentions: Optional[Tuple[torch.FloatTensor]] = None
-@dataclass
-class XVectorOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2ForXVector`].
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Classification hidden states before AMSoftmax.
-        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Utterance embeddings used for vector similarity-based retrieval.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    embeddings: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
 def _compute_mask_indices(
     shape: Tuple[int, int],
@@ -1198,7 +1144,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
     @add_code_sample_docstrings(
         processor_class=_PROCESSOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=UniSpeechSatBaseModelOutput,
+        output_type=Wav2Vec2BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
         modality="audio",
         expected_output=_EXPECTED_OUTPUT_SHAPE,
@@ -1211,7 +1157,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, UniSpeechSatBaseModelOutput]:
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1243,7 +1189,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
         if not return_dict:
             return (hidden_states, extract_features) + encoder_outputs[1:]
-        return UniSpeechSatBaseModelOutput(
+        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
......
@@ -78,13 +78,13 @@ class Wav2Vec2Config(PretrainedConfig):
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature encoder states.
-        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
-        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
-        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
@@ -156,13 +156,13 @@ class Wav2Vec2Config(PretrainedConfig):
             instance of [`Wav2Vec2ForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
             A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
             module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
-        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
             *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
-        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
             A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
......
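Note: the docstring updates above reflect that these arguments accept plain lists as well as tuples. A small sketch, passing the documented defaults explicitly as lists (all values taken from the docstrings above):

```python
from transformers import Wav2Vec2Config

config = Wav2Vec2Config(
    conv_dim=[512, 512, 512, 512, 512, 512, 512],
    conv_stride=[5, 2, 2, 2, 2, 2, 2],
    conv_kernel=[10, 3, 3, 3, 3, 3, 3],
    tdnn_dim=[512, 512, 512, 512, 1500],
    tdnn_kernel=[5, 3, 3, 1, 1],
    tdnn_dilation=[1, 2, 3, 1, 1],
)
print(len(config.conv_dim))  # 7 convolutional layers in the feature encoder
```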
@@ -33,6 +33,8 @@ from ...modeling_outputs import (
     MaskedLMOutput,
     SequenceClassifierOutput,
     TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
@@ -88,35 +90,6 @@ WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class Wav2Vec2BaseModelOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2BaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
 class Wav2Vec2ForPreTrainingOutput(ModelOutput):
     """
@@ -159,38 +132,6 @@ class Wav2Vec2ForPreTrainingOutput(ModelOutput):
     diversity_loss: Optional[torch.FloatTensor] = None
-@dataclass
-class XVectorOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2ForXVector`].
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Classification hidden states before AMSoftmax.
-        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Utterance embeddings used for vector similarity-based retrieval.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    embeddings: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 def _compute_mask_indices(
     shape: Tuple[int, int],
     mask_prob: float,
@@ -1025,11 +966,8 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
         codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
         # use probs to retrieve codevectors
         codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
-        codevectors = (
-            codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
-            .sum(-2)
-            .view(batch_size, sequence_length, -1)
-        )
+        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
         return codevectors, perplexity
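Note: the quantizer hunk above only splits a chained expression into two statements; the result is identical. A standalone sketch with made-up shapes (the sizes below are illustrative, not the config defaults):

```python
import torch

batch_size, sequence_length, num_groups, num_vars, dim = 2, 4, 2, 8, 16
codevectors_per_group = torch.randn(batch_size * sequence_length, num_groups * num_vars, dim)

# Old chained form.
old = (
    codevectors_per_group.view(batch_size * sequence_length, num_groups, num_vars, -1)
    .sum(-2)
    .view(batch_size, sequence_length, -1)
)

# New two-step form from the diff.
codevectors = codevectors_per_group.view(batch_size * sequence_length, num_groups, num_vars, -1)
new = codevectors.sum(-2).view(batch_size, sequence_length, -1)

assert torch.equal(old, new)  # same tensor either way
```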
@@ -1473,13 +1411,13 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         ```python
         >>> import torch
-        >>> from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForPreTraining
+        >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining
         >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
         >>> from datasets import load_dataset
         >>> import soundfile as sf
-        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/wav2vec2-base")
-        >>> model = Wav2Vec2ForPreTraining.from_pretrained("patrickvonplaten/wav2vec2-base")
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+        >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
......
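Note: the pre-training example above is truncated here. A hedged sketch of how such a snippet can continue (this is not the exact doctest text; it only uses `_compute_mask_indices`, which the example already imports, and standard model methods), picking up from `model` and `input_values` defined above:

```python
import torch
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices

batch_size, raw_sequence_length = input_values.shape
sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()

# Sample SpecAugment-style mask indices and run a forward pass without gradients.
mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.bool)

with torch.no_grad():
    outputs = model(input_values, mask_time_indices=mask_time_indices)
```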
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
    "configuration_wav2vec2_conformer": [
        "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Wav2Vec2ConformerConfig",
    ],
}

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_wav2vec2_conformer"] = [
        "WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Wav2Vec2ConformerForAudioFrameClassification",
        "Wav2Vec2ConformerForCTC",
        "Wav2Vec2ConformerForPreTraining",
        "Wav2Vec2ConformerForSequenceClassification",
        "Wav2Vec2ConformerForXVector",
        "Wav2Vec2ConformerModel",
        "Wav2Vec2ConformerPreTrainedModel",
    ]
if TYPE_CHECKING:
    from .configuration_wav2vec2_conformer import (
        WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Wav2Vec2ConformerConfig,
    )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wav2vec2_conformer import (
            WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            Wav2Vec2ConformerForAudioFrameClassification,
            Wav2Vec2ConformerForCTC,
            Wav2Vec2ConformerForPreTraining,
            Wav2Vec2ConformerForSequenceClassification,
            Wav2Vec2ConformerForXVector,
            Wav2Vec2ConformerModel,
            Wav2Vec2ConformerPreTrainedModel,
        )

else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
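Note: with PyTorch installed, the lazy module above resolves these names on first access. A minimal usage sketch (no checkpoint download, just a randomly initialized model):

```python
from transformers.models.wav2vec2_conformer import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerModel,
)

config = Wav2Vec2ConformerConfig()      # default configuration
model = Wav2Vec2ConformerModel(config)  # randomly initialized, no pretrained weights
print(type(model).__name__)
```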
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Wav2Vec2Conformer checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from transformers import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerForCTC,
    Wav2Vec2ConformerForPreTraining,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.linear_k": "encoder.layers.*.self_attn.linear_k",
    "self_attn.linear_v": "encoder.layers.*.self_attn.linear_v",
    "self_attn.linear_q": "encoder.layers.*.self_attn.linear_q",
    "self_attn.pos_bias_u": "encoder.layers.*.self_attn.pos_bias_u",
    "self_attn.pos_bias_v": "encoder.layers.*.self_attn.pos_bias_v",
    "self_attn.linear_out": "encoder.layers.*.self_attn.linear_out",
    "self_attn.linear_pos": "encoder.layers.*.self_attn.linear_pos",
    "self_attn.rotary_emb": "encoder.embed_positions",
    "self_attn_layer_norm": "encoder.layers.*.self_attn_layer_norm",
    "conv_module.pointwise_conv1": "encoder.layers.*.conv_module.pointwise_conv1",
    "conv_module.pointwise_conv2": "encoder.layers.*.conv_module.pointwise_conv2",
    "conv_module.depthwise_conv": "encoder.layers.*.conv_module.depthwise_conv",
    "conv_module.batch_norm": "encoder.layers.*.conv_module.batch_norm",
    "conv_module.layer_norm": "encoder.layers.*.conv_module.layer_norm",
    "ffn1.w_1": "encoder.layers.*.ffn1.intermediate_dense",
    "ffn1.w_2": "encoder.layers.*.ffn1.output_dense",
    "ffn1.layer_norm": "encoder.layers.*.ffn1_layer_norm",
    "ffn2.w_1": "encoder.layers.*.ffn2.intermediate_dense",
    "ffn2.w_2": "encoder.layers.*.ffn2.output_dense",
    "ffn2.layer_norm": "encoder.layers.*.ffn2_layer_norm",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "lm_head",
    "mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
    "lm_head",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
    for attribute in key.split("."):
        hf_pointer = getattr(hf_pointer, attribute)

    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        hf_shape = hf_pointer.shape

    if hf_shape != value.shape:
        raise ValueError(
            f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
            f" {value.shape} for {full_name}"
        )

    if weight_type == "weight":
        hf_pointer.weight.data = value
    elif weight_type == "weight_g":
        hf_pointer.weight_g.data = value
    elif weight_type == "weight_v":
        hf_pointer.weight_v.data = value
    elif weight_type == "bias":
        hf_pointer.bias.data = value
    elif weight_type == "running_mean":
        hf_pointer.running_mean.data = value
    elif weight_type == "running_var":
        hf_pointer.running_var.data = value
    elif weight_type == "num_batches_tracked":
        hf_pointer.num_batches_tracked.data = value
    elif weight_type == "inv_freq":
        hf_pointer.inv_freq.data = value
    else:
        hf_pointer.data = value

    logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_headless):
    unused_weights = []
    fairseq_dict = fairseq_model.state_dict()

    feature_extractor = hf_model.wav2vec2_conformer.feature_extractor

    for name, value in fairseq_dict.items():
        is_used = False
        if "conv_layers" in name:
            load_conv_layer(
                name,
                value,
                feature_extractor,
                unused_weights,
                hf_model.config.feat_extract_norm == "group",
            )
            is_used = True
        else:
            for key, mapped_key in MAPPING.items():
                mapped_key = "wav2vec2_conformer." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
                if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
                    is_used = True
                    if "*" in mapped_key:
                        layer_index = name.split(key)[0].split(".")[-2]
                        mapped_key = mapped_key.replace("*", layer_index)
                    if "pos_bias_u" in name:
                        weight_type = None
                    elif "pos_bias_v" in name:
                        weight_type = None
                    elif "weight_g" in name:
                        weight_type = "weight_g"
                    elif "weight_v" in name:
                        weight_type = "weight_v"
                    elif "bias" in name:
                        weight_type = "bias"
                    elif "weight" in name:
                        # TODO: don't match quantizer.weight_proj
                        weight_type = "weight"
                    elif "running_mean" in name:
                        weight_type = "running_mean"
                    elif "inv_freq" in name:
                        weight_type = "inv_freq"
                    elif "running_var" in name:
                        weight_type = "running_var"
                    elif "num_batches_tracked" in name:
                        weight_type = "num_batches_tracked"
                    else:
                        weight_type = None
                    set_recursively(hf_model, mapped_key, value, name, weight_type)
                continue
        if not is_used:
            unused_weights.append(name)

    logger.warning(f"Unused weights: {unused_weights}")
# Copied from transformers.models.wav2vec2.convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.load_conv_layer
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    name = full_name.split("conv_layers.")[-1]
    items = name.split(".")
    layer_id = int(items[0])
    type_id = int(items[1])

    if type_id == 0:
        if "bias" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].conv.bias.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
        elif "weight" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].conv.weight.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
        if "bias" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
        elif "weight" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
    else:
        unused_weights.append(full_name)
@torch.no_grad()
def convert_wav2vec2_conformer_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = Wav2Vec2ConformerConfig.from_pretrained(config_path, hidden_act="swish")
    else:
        config = Wav2Vec2ConformerConfig()

    if "rope" in checkpoint_path:
        config.position_embeddings_type = "rotary"

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important change bos & pad token id since CTC symbol is <pad> and
            # not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            vocab_dict = target_dict.indices

            # fairseq has the <pad> and <s> switched
            vocab_dict["<pad>"] = 0
            vocab_dict["<s>"] = 1
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(vocab_dict, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            return_attention_mask = True if config.feat_extract_norm == "layer" else False
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = Wav2Vec2ConformerForCTC(config)
    else:
        hf_wav2vec = Wav2Vec2ConformerForPreTraining(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
        )
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, not is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_wav2vec2_conformer_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
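Note: a hedged sketch of driving the converter above programmatically instead of through argparse (equivalent to the `--checkpoint_path`, `--pytorch_dump_folder_path`, `--config_path` and `--dict_path` flags); all paths are placeholders and fairseq must be installed:

```python
# Assumes the converter module above is importable (requires fairseq); paths are placeholders.
convert_wav2vec2_conformer_checkpoint(
    checkpoint_path="/path/to/fairseq_checkpoint.pt",
    pytorch_dump_folder_path="/path/to/output_dir",  # must be an existing directory
    config_path="/path/to/config.json",              # optional; defaults to Wav2Vec2ConformerConfig()
    dict_path="/path/to/dict.ltr.txt",               # needed for fine-tuned (CTC) checkpoints
    is_finetuned=True,
)
```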
@@ -77,13 +77,13 @@ class WavLMConfig(PretrainedConfig):
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature encoder states.
-        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
-        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
-        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
             *conv_dim*.
@@ -146,13 +146,13 @@ class WavLMConfig(PretrainedConfig):
             instance of [`WavLMForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
             A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
             module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
-        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
             *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
-        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
             A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
......
@@ -16,7 +16,6 @@
 import math
 import warnings
-from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import numpy as np
@@ -28,16 +27,17 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
-from ...utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-)
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_wavlm import WavLMConfig
@@ -80,67 +80,6 @@ WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class WavLMBaseModelOutput(ModelOutput):
-    """
-    Output type of [`WavLMBaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-@dataclass
-class XVectorOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2ForXVector`].
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Classification hidden states before AMSoftmax.
-        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Utterance embeddings used for vector similarity-based retrieval.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    embeddings: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
 def _compute_mask_indices(
     shape: Tuple[int, int],
@@ -1184,7 +1123,7 @@ WAVLM_INPUTS_DOCSTRING = r"""
     "The bare WavLM Model transformer outputting raw hidden-states without any specific head on top.",
     WAVLM_START_DOCSTRING,
 )
-# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM
+# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM, WavLMBaseModelOutput->Wav2Vec2BaseModelOutput
 class WavLMModel(WavLMPreTrainedModel):
     def __init__(self, config: WavLMConfig):
         super().__init__(config)
@@ -1275,7 +1214,7 @@ class WavLMModel(WavLMPreTrainedModel):
     @add_code_sample_docstrings(
         processor_class=_PROCESSOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=WavLMBaseModelOutput,
+        output_type=Wav2Vec2BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
         modality="audio",
         expected_output=_EXPECTED_OUTPUT_SHAPE,
@@ -1288,7 +1227,7 @@ class WavLMModel(WavLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, WavLMBaseModelOutput]:
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1325,7 +1264,7 @@ class WavLMModel(WavLMPreTrainedModel):
         if not return_dict:
             return (hidden_states, extract_features) + encoder_outputs[1:]
-        return WavLMBaseModelOutput(
+        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
......
@@ -4440,6 +4440,58 @@ class Wav2Vec2PreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
+WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class Wav2Vec2ConformerForAudioFrameClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForCTC(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForPreTraining(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForXVector(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
 WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
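Note: the dummy objects above keep `transformers` importable when PyTorch is absent; instantiating one of the new classes then fails with an informative error rather than an obscure import failure. A small sketch of that behaviour, assuming an environment without torch:

```python
from transformers import Wav2Vec2ConformerModel  # resolves to the dummy when torch is missing

try:
    Wav2Vec2ConformerModel()
except ImportError as err:
    print(err)  # message explains that this class requires the PyTorch backend
```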
@@ -61,6 +61,7 @@ src/transformers/models/vit/modeling_tf_vit.py
 src/transformers/models/vit_mae/modeling_vit_mae.py
 src/transformers/models/wav2vec2/modeling_wav2vec2.py
 src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
 src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
 src/transformers/models/wavlm/modeling_wavlm.py
 src/transformers/models/yolos/modeling_yolos.py