Unverified Commit d83ff5ee authored by Connor Henderson, committed by GitHub

Add FastSpeech2Conformer (#23439)

* start - docs, SpeechT5 copy and rename

* add relevant code from FastSpeech2 draft, have tests pass

* make it an actual conformer, demo ex.

* matching inference with original repo, includes debug code

* refactor nn.Sequentials, start more desc. var names

* more renaming

* more renaming

* vocoder scratchwork

* matching vocoder outputs

* hifigan vocoder conversion script

* convert model script, rename some config vars

* replace postnet with speecht5's implementation

* passing common tests, file cleanup

* expand testing, add output hidden states and attention

* tokenizer + passing tokenizer tests

* variety of updates and tests

* g2p_en pckg setup

* import structure edits

* docstrings and cleanup

* repo consistency

* deps

* small cleanup

* forward signature param order

* address comments except for masks and labels

* address comments on attention_mask and labels

* address second round of comments

* remove old unneeded line

* address comments part 1

* address comments pt 2

* rename auto mapping

* fixes for failing tests

* address comments part 3 (bart-like, train loss)

* make style

* pass config where possible

* add forward method + tests to WithHifiGan model

* make style

* address arg passing and generate_speech comments

* address Arthur comments

* address Arthur comments pt2

* lint  changes

* Sanchit comment

* add g2p-en to doctest deps

* move up self.encoder

* onnx compatible tensor method

* fix is symbolic

* fix paper url

* move models to espnet org

* make style

* make fix-copies

* update docstring

* Arthur comments

* update docstring w/ new updates

* add model architecture images

* header size

* md wording update

* make style
parent 6eba901d
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" FastSpeech2Conformer model configuration"""
from typing import Dict
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_hifigan/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_with_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_with_hifigan/raw/main/config.json",
}
class FastSpeech2ConformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to
instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 384):
The dimensionality of the hidden layers.
vocab_size (`int`, *optional*, defaults to 78):
The size of the vocabulary.
num_mel_bins (`int`, *optional*, defaults to 80):
The number of mel filters used in the filter bank.
encoder_num_attention_heads (`int`, *optional*, defaults to 2):
The number of attention heads in the encoder.
encoder_layers (`int`, *optional*, defaults to 4):
The number of layers in the encoder.
encoder_linear_units (`int`, *optional*, defaults to 1536):
The number of units in the linear layer of the encoder.
decoder_layers (`int`, *optional*, defaults to 4):
The number of layers in the decoder.
decoder_num_attention_heads (`int`, *optional*, defaults to 2):
The number of attention heads in the decoder.
decoder_linear_units (`int`, *optional*, defaults to 1536):
The number of units in the linear layer of the decoder.
speech_decoder_postnet_layers (`int`, *optional*, defaults to 5):
The number of layers in the post-net of the speech decoder.
speech_decoder_postnet_units (`int`, *optional*, defaults to 256):
The number of units in the post-net layers of the speech decoder.
speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5):
The kernel size in the post-net of the speech decoder.
positionwise_conv_kernel_size (`int`, *optional*, defaults to 3):
The size of the convolution kernel used in the position-wise layer.
encoder_normalize_before (`bool`, *optional*, defaults to `False`):
Specifies whether to normalize before encoder layers.
decoder_normalize_before (`bool`, *optional*, defaults to `False`):
Specifies whether to normalize before decoder layers.
encoder_concat_after (`bool`, *optional*, defaults to `False`):
Specifies whether to concatenate after encoder layers.
decoder_concat_after (`bool`, *optional*, defaults to `False`):
Specifies whether to concatenate after decoder layers.
reduction_factor (`int`, *optional*, defaults to 1):
The factor by which the speech frame rate is reduced.
speaking_speed (`float`, *optional*, defaults to 1.0):
The speed of the speech produced.
use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`):
Specifies whether to use macaron style in the conformer.
use_cnn_in_conformer (`bool`, *optional*, defaults to `True`):
Specifies whether to use convolutional neural networks in the conformer.
encoder_kernel_size (`int`, *optional*, defaults to 7):
The kernel size used in the encoder.
decoder_kernel_size (`int`, *optional*, defaults to 31):
The kernel size used in the decoder.
duration_predictor_layers (`int`, *optional*, defaults to 2):
The number of layers in the duration predictor.
duration_predictor_channels (`int`, *optional*, defaults to 256):
The number of channels in the duration predictor.
duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
The kernel size used in the duration predictor.
energy_predictor_layers (`int`, *optional*, defaults to 2):
The number of layers in the energy predictor.
energy_predictor_channels (`int`, *optional*, defaults to 256):
The number of channels in the energy predictor.
energy_predictor_kernel_size (`int`, *optional*, defaults to 3):
The kernel size used in the energy predictor.
energy_predictor_dropout (`float`, *optional*, defaults to 0.5):
The dropout rate in the energy predictor.
energy_embed_kernel_size (`int`, *optional*, defaults to 1):
The kernel size used in the energy embed layer.
energy_embed_dropout (`float`, *optional*, defaults to 0.0):
The dropout rate in the energy embed layer.
stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`):
Specifies whether to stop gradients from the energy predictor.
pitch_predictor_layers (`int`, *optional*, defaults to 5):
The number of layers in the pitch predictor.
pitch_predictor_channels (`int`, *optional*, defaults to 256):
The number of channels in the pitch predictor.
pitch_predictor_kernel_size (`int`, *optional*, defaults to 5):
The kernel size used in the pitch predictor.
pitch_predictor_dropout (`float`, *optional*, defaults to 0.5):
The dropout rate in the pitch predictor.
pitch_embed_kernel_size (`int`, *optional*, defaults to 1):
The kernel size used in the pitch embed layer.
pitch_embed_dropout (`float`, *optional*, defaults to 0.0):
The dropout rate in the pitch embed layer.
stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`):
Specifies whether to stop gradients from the pitch predictor.
encoder_dropout_rate (`float`, *optional*, defaults to 0.2):
The dropout rate in the encoder.
encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
The positional dropout rate in the encoder.
encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
The attention dropout rate in the encoder.
decoder_dropout_rate (`float`, *optional*, defaults to 0.2):
The dropout rate in the decoder.
decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
The positional dropout rate in the decoder.
decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
The attention dropout rate in the decoder.
duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2):
The dropout rate in the duration predictor.
speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5):
The dropout rate in the speech decoder postnet.
max_source_positions (`int`, *optional*, defaults to 5000):
if `"relative"` position embeddings are used, defines the maximum source input positions.
use_masking (`bool`, *optional*, defaults to `True`):
Specifies whether to use masking in the model.
use_weighted_masking (`bool`, *optional*, defaults to `False`):
Specifies whether to use weighted masking in the model.
num_speakers (`int`, *optional*):
Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use
the speaker id embedding layer.
num_languages (`int`, *optional*):
Number of languages. If set to > 1, assume that the language ids will be provided as the input and use
the language id embedding layer.
speaker_embed_dim (`int`, *optional*):
Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Specifies whether the model is an encoder-decoder.
Example:
```python
>>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig
>>> # Initializing a FastSpeech2Conformer style configuration
>>> configuration = FastSpeech2ConformerConfig()
>>> # Initializing a model from the FastSpeech2Conformer style configuration
>>> model = FastSpeech2ConformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "fastspeech2_conformer"
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
def __init__(
self,
hidden_size=384,
vocab_size=78,
num_mel_bins=80,
encoder_num_attention_heads=2,
encoder_layers=4,
encoder_linear_units=1536,
decoder_layers=4,
decoder_num_attention_heads=2,
decoder_linear_units=1536,
speech_decoder_postnet_layers=5,
speech_decoder_postnet_units=256,
speech_decoder_postnet_kernel=5,
positionwise_conv_kernel_size=3,
encoder_normalize_before=False,
decoder_normalize_before=False,
encoder_concat_after=False,
decoder_concat_after=False,
reduction_factor=1,
speaking_speed=1.0,
use_macaron_style_in_conformer=True,
use_cnn_in_conformer=True,
encoder_kernel_size=7,
decoder_kernel_size=31,
duration_predictor_layers=2,
duration_predictor_channels=256,
duration_predictor_kernel_size=3,
energy_predictor_layers=2,
energy_predictor_channels=256,
energy_predictor_kernel_size=3,
energy_predictor_dropout=0.5,
energy_embed_kernel_size=1,
energy_embed_dropout=0.0,
stop_gradient_from_energy_predictor=False,
pitch_predictor_layers=5,
pitch_predictor_channels=256,
pitch_predictor_kernel_size=5,
pitch_predictor_dropout=0.5,
pitch_embed_kernel_size=1,
pitch_embed_dropout=0.0,
stop_gradient_from_pitch_predictor=True,
encoder_dropout_rate=0.2,
encoder_positional_dropout_rate=0.2,
encoder_attention_dropout_rate=0.2,
decoder_dropout_rate=0.2,
decoder_positional_dropout_rate=0.2,
decoder_attention_dropout_rate=0.2,
duration_predictor_dropout_rate=0.2,
speech_decoder_postnet_dropout=0.5,
max_source_positions=5000,
use_masking=True,
use_weighted_masking=False,
num_speakers=None,
num_languages=None,
speaker_embed_dim=None,
is_encoder_decoder=True,
**kwargs,
):
if positionwise_conv_kernel_size % 2 == 0:
raise ValueError(
f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead."
)
if encoder_kernel_size % 2 == 0:
raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.")
if decoder_kernel_size % 2 == 0:
raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.")
if duration_predictor_kernel_size % 2 == 0:
raise ValueError(
f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead."
)
if energy_predictor_kernel_size % 2 == 0:
raise ValueError(
f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead."
)
if energy_embed_kernel_size % 2 == 0:
raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.")
if pitch_predictor_kernel_size % 2 == 0:
raise ValueError(
f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead."
)
if pitch_embed_kernel_size % 2 == 0:
raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.")
if hidden_size % encoder_num_attention_heads != 0:
raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.")
if hidden_size % decoder_num_attention_heads != 0:
raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.")
if use_masking and use_weighted_masking:
raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.num_mel_bins = num_mel_bins
self.encoder_config = {
"num_attention_heads": encoder_num_attention_heads,
"layers": encoder_layers,
"kernel_size": encoder_kernel_size,
"attention_dropout_rate": encoder_attention_dropout_rate,
"dropout_rate": encoder_dropout_rate,
"positional_dropout_rate": encoder_positional_dropout_rate,
"linear_units": encoder_linear_units,
"normalize_before": encoder_normalize_before,
"concat_after": encoder_concat_after,
}
self.decoder_config = {
"num_attention_heads": decoder_num_attention_heads,
"layers": decoder_layers,
"kernel_size": decoder_kernel_size,
"attention_dropout_rate": decoder_attention_dropout_rate,
"dropout_rate": decoder_dropout_rate,
"positional_dropout_rate": decoder_positional_dropout_rate,
"linear_units": decoder_linear_units,
"normalize_before": decoder_normalize_before,
"concat_after": decoder_concat_after,
}
self.encoder_num_attention_heads = encoder_num_attention_heads
self.encoder_layers = encoder_layers
self.duration_predictor_channels = duration_predictor_channels
self.duration_predictor_kernel_size = duration_predictor_kernel_size
self.duration_predictor_layers = duration_predictor_layers
self.energy_embed_dropout = energy_embed_dropout
self.energy_embed_kernel_size = energy_embed_kernel_size
self.energy_predictor_channels = energy_predictor_channels
self.energy_predictor_dropout = energy_predictor_dropout
self.energy_predictor_kernel_size = energy_predictor_kernel_size
self.energy_predictor_layers = energy_predictor_layers
self.pitch_embed_dropout = pitch_embed_dropout
self.pitch_embed_kernel_size = pitch_embed_kernel_size
self.pitch_predictor_channels = pitch_predictor_channels
self.pitch_predictor_dropout = pitch_predictor_dropout
self.pitch_predictor_kernel_size = pitch_predictor_kernel_size
self.pitch_predictor_layers = pitch_predictor_layers
self.positionwise_conv_kernel_size = positionwise_conv_kernel_size
self.speech_decoder_postnet_units = speech_decoder_postnet_units
self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout
self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
self.reduction_factor = reduction_factor
self.speaking_speed = speaking_speed
self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
self.max_source_positions = max_source_positions
self.use_cnn_in_conformer = use_cnn_in_conformer
self.use_macaron_style_in_conformer = use_macaron_style_in_conformer
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.num_speakers = num_speakers
self.num_languages = num_languages
self.speaker_embed_dim = speaker_embed_dim
self.duration_predictor_dropout_rate = duration_predictor_dropout_rate
self.is_encoder_decoder = is_encoder_decoder
super().__init__(
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
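# Illustrative example: every convolution kernel size above must be odd so that the symmetric
# padding `(kernel_size - 1) // 2` used by the model preserves the sequence length. An even
# value is rejected at construction time, e.g.:
#
#   >>> FastSpeech2ConformerConfig(encoder_kernel_size=8)
#   Traceback (most recent call last):
#     ...
#   ValueError: encoder_kernel_size must be odd, but got 8 instead.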
class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGan`] model. It is used to
instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_in_dim (`int`, *optional*, defaults to 80):
The number of frequency bins in the input log-mel spectrogram.
upsample_initial_channel (`int`, *optional*, defaults to 512):
The number of input channels into the upsampling network.
upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
length of *upsample_rates* defines the number of convolutional layers and has to match the length of
*upsample_kernel_sizes*.
upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
*upsample_rates*.
resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
fusion (MRF) module.
resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
multi-receptive field fusion (MRF) module.
initializer_range (`float`, *optional*, defaults to 0.01):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
The angle of the negative slope used by the leaky ReLU activation.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
Example:
```python
>>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig
>>> # Initializing a FastSpeech2ConformerHifiGan configuration
>>> configuration = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = FastSpeech2ConformerHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "hifigan"
def __init__(
self,
model_in_dim=80,
upsample_initial_channel=512,
upsample_rates=[8, 8, 2, 2],
upsample_kernel_sizes=[16, 16, 4, 4],
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
initializer_range=0.01,
leaky_relu_slope=0.1,
normalize_before=True,
**kwargs,
):
self.model_in_dim = model_in_dim
self.upsample_initial_channel = upsample_initial_channel
self.upsample_rates = upsample_rates
self.upsample_kernel_sizes = upsample_kernel_sizes
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.initializer_range = initializer_range
self.leaky_relu_slope = leaky_relu_slope
self.normalize_before = normalize_before
super().__init__(**kwargs)
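# Illustrative note: the vocoder's total upsampling factor is the product of `upsample_rates`.
# With the defaults [8, 8, 2, 2], each input mel frame is expanded into 8 * 8 * 2 * 2 = 256
# waveform samples, which corresponds to a hop length of 256 audio samples per mel frame.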
class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to
instantiate a [`FastSpeech2ConformerWithHifiGan`] model according to the specified sub-model configurations,
defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and
FastSpeech2ConformerHifiGan
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_config (`typing.Dict`, *optional*):
Configuration dictionary used to instantiate the text-to-speech [`FastSpeech2ConformerConfig`] sub-model.
vocoder_config (`typing.Dict`, *optional*):
Configuration dictionary used to instantiate the [`FastSpeech2ConformerHifiGanConfig`] vocoder sub-model.
Example:
```python
>>> from transformers import (
... FastSpeech2ConformerConfig,
... FastSpeech2ConformerHifiGanConfig,
... FastSpeech2ConformerWithHifiGanConfig,
... FastSpeech2ConformerWithHifiGan,
... )
>>> # Initializing the FastSpeech2ConformerWithHifiGan sub-module configurations.
>>> model_config = FastSpeech2ConformerConfig()
>>> vocoder_config = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration
>>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict())
>>> # Initializing a model (with random weights)
>>> model = FastSpeech2ConformerWithHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fastspeech2_conformer_with_hifigan"
is_composition = True
def __init__(
self,
model_config: Dict = None,
vocoder_config: Dict = None,
**kwargs,
):
if model_config is None:
model_config = {}
logger.info("model_config is None. initializing the model with default values.")
if vocoder_config is None:
vocoder_config = {}
logger.info("vocoder_config is None. initializing the coarse model with default values.")
self.model_config = FastSpeech2ConformerConfig(**model_config)
self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config)
super().__init__(**kwargs)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FastSpeech2Conformer checkpoint."""
import argparse
import json
import re
from pathlib import Path
from tempfile import TemporaryDirectory
import torch
import yaml
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerTokenizer,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
CONFIG_MAPPING = {
"adim": "hidden_size",
"aheads": "num_attention_heads",
"conformer_dec_kernel_size": "decoder_kernel_size",
"conformer_enc_kernel_size": "encoder_kernel_size",
"decoder_normalize_before": "decoder_normalize_before",
"dlayers": "decoder_layers",
"dunits": "decoder_linear_units",
"duration_predictor_chans": "duration_predictor_channels",
"duration_predictor_kernel_size": "duration_predictor_kernel_size",
"duration_predictor_layers": "duration_predictor_layers",
"elayers": "encoder_layers",
"encoder_normalize_before": "encoder_normalize_before",
"energy_embed_dropout": "energy_embed_dropout",
"energy_embed_kernel_size": "energy_embed_kernel_size",
"energy_predictor_chans": "energy_predictor_channels",
"energy_predictor_dropout": "energy_predictor_dropout",
"energy_predictor_kernel_size": "energy_predictor_kernel_size",
"energy_predictor_layers": "energy_predictor_layers",
"eunits": "encoder_linear_units",
"pitch_embed_dropout": "pitch_embed_dropout",
"pitch_embed_kernel_size": "pitch_embed_kernel_size",
"pitch_predictor_chans": "pitch_predictor_channels",
"pitch_predictor_dropout": "pitch_predictor_dropout",
"pitch_predictor_kernel_size": "pitch_predictor_kernel_size",
"pitch_predictor_layers": "pitch_predictor_layers",
"positionwise_conv_kernel_size": "positionwise_conv_kernel_size",
"postnet_chans": "speech_decoder_postnet_units",
"postnet_filts": "speech_decoder_postnet_kernel",
"postnet_layers": "speech_decoder_postnet_layers",
"reduction_factor": "reduction_factor",
"stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor",
"stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor",
"transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate",
"transformer_dec_dropout_rate": "decoder_dropout_rate",
"transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate",
"transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate",
"transformer_enc_dropout_rate": "encoder_dropout_rate",
"transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate",
"use_cnn_in_conformer": "use_cnn_in_conformer",
"use_macaron_style_in_conformer": "use_macaron_style_in_conformer",
"use_masking": "use_masking",
"use_weighted_masking": "use_weighted_masking",
"idim": "input_dim",
"odim": "num_mel_bins",
"spk_embed_dim": "speaker_embed_dim",
"langs": "num_languages",
"spks": "num_speakers",
}
def remap_model_yaml_config(yaml_config_path):
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
remapped_config = {}
model_params = args.tts_conf["text2mel_params"]
# espnet_config_key -> hf_config_key, any keys not included are ignored
for espnet_config_key, hf_config_key in CONFIG_MAPPING.items():
if espnet_config_key in model_params:
remapped_config[hf_config_key] = model_params[espnet_config_key]
return remapped_config, args.g2p, args.token_list
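# Illustrative example of the remapping above, using a small hand-written subset of ESPnet keys
# (values are placeholders); keys that are absent from CONFIG_MAPPING are simply dropped:
#
#   >>> espnet_params = {"adim": 384, "aheads": 2, "elayers": 4, "some_unmapped_key": 0}
#   >>> {hf_key: espnet_params[k] for k, hf_key in CONFIG_MAPPING.items() if k in espnet_params}
#   {'hidden_size': 384, 'num_attention_heads': 2, 'encoder_layers': 4}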
def convert_espnet_state_dict_to_hf(state_dict):
new_state_dict = {}
for key in state_dict:
if "tts.generator.text2mel." in key:
new_key = key.replace("tts.generator.text2mel.", "")
if "postnet" in key:
new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers")
new_key = new_key.replace(".0.weight", ".conv.weight")
new_key = new_key.replace(".1.weight", ".batch_norm.weight")
new_key = new_key.replace(".1.bias", ".batch_norm.bias")
new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean")
new_key = new_key.replace(".1.running_var", ".batch_norm.running_var")
new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked")
if "feat_out" in key:
if "weight" in key:
new_key = "speech_decoder_postnet.feat_out.weight"
if "bias" in key:
new_key = "speech_decoder_postnet.feat_out.bias"
if "encoder.embed.0.weight" in key:
new_key = new_key.replace("0.", "")
if "w_1" in key:
new_key = new_key.replace("w_1", "conv1")
if "w_2" in key:
new_key = new_key.replace("w_2", "conv2")
if "predictor.conv" in key:
new_key = new_key.replace(".conv", ".conv_layers")
pattern = r"(\d)\.(\d)"
replacement = (
r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm"
)
new_key = re.sub(pattern, replacement, new_key)
if "pitch_embed" in key or "energy_embed" in key:
new_key = new_key.replace("0", "conv")
if "encoders" in key:
new_key = new_key.replace("encoders", "conformer_layers")
new_key = new_key.replace("norm_final", "final_layer_norm")
new_key = new_key.replace("norm_mha", "self_attn_layer_norm")
new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm")
new_key = new_key.replace("norm_ff", "ff_layer_norm")
new_key = new_key.replace("norm_conv", "conv_layer_norm")
if "lid_emb" in key:
new_key = new_key.replace("lid_emb", "language_id_embedding")
if "sid_emb" in key:
new_key = new_key.replace("sid_emb", "speaker_id_embedding")
new_state_dict[new_key] = state_dict[key]
return new_state_dict
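# Illustrative example of a single key rename performed by the function above (the tensors are
# copied over unchanged, only the key names are rewritten):
#
#   "tts.generator.text2mel.postnet.postnet.0.0.weight"
#       -> "speech_decoder_postnet.layers.0.conv.weight"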
@torch.no_grad()
def convert_FastSpeech2ConformerModel_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path)
config = FastSpeech2ConformerConfig(**model_params)
# Prepare the model
model = FastSpeech2ConformerModel(config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
model.save_pretrained(pytorch_dump_folder_path)
# Prepare the tokenizer
with TemporaryDirectory() as tempdir:
vocab = {token: id for id, token in enumerate(vocab)}
vocab_file = Path(tempdir) / "vocab.json"
with open(vocab_file, "w") as f:
json.dump(vocab, f)
should_strip_spaces = "no_space" in tokenizer_name
tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument(
"--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
)
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_FastSpeech2ConformerModel_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
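# Example invocation (all paths are placeholders):
#
#   python convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py \
#       --checkpoint_path /path/to/espnet_checkpoint.pth \
#       --yaml_config_path /path/to/espnet/config.yaml \
#       --pytorch_dump_folder_path ./fastspeech2_conformer_hf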
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FastSpeech2Conformer HiFi-GAN checkpoint."""
import argparse
from pathlib import Path
import torch
import yaml
from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def load_weights(checkpoint, hf_model, config):
vocoder_key_prefix = "tts.generator.vocoder."
checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k}
hf_model.apply_weight_norm()
hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
for i in range(len(config.upsample_rates)):
hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
for j in range(len(config.resblock_dilation_sizes)):
hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
hf_model.remove_weight_norm()
def remap_hifigan_yaml_config(yaml_config_path):
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
vocoder_type = args.tts_conf["vocoder_type"]
if vocoder_type != "hifigan_generator":
raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}")
remapped_dict = {}
vocoder_params = args.tts_conf["vocoder_params"]
# espnet_config_key -> hf_config_key
key_mappings = {
"channels": "upsample_initial_channel",
"in_channels": "model_in_dim",
"resblock_dilations": "resblock_dilation_sizes",
"resblock_kernel_sizes": "resblock_kernel_sizes",
"upsample_kernel_sizes": "upsample_kernel_sizes",
"upsample_scales": "upsample_rates",
}
for espnet_config_key, hf_config_key in key_mappings.items():
remapped_dict[hf_config_key] = vocoder_params[espnet_config_key]
remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"]
remapped_dict["normalize_before"] = False
remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"]
return remapped_dict
@torch.no_grad()
def convert_hifigan_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
yaml_config_path=None,
repo_id=None,
):
if yaml_config_path is not None:
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
else:
config = FastSpeech2ConformerHifiGanConfig()
model = FastSpeech2ConformerHifiGan(config)
orig_checkpoint = torch.load(checkpoint_path)
load_weights(orig_checkpoint, model, config)
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_hifigan_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.yaml_config_path,
args.push_to_hub,
)
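# Example invocation (paths are placeholders; --yaml_config_path may be omitted, in which case
# the default FastSpeech2ConformerHifiGanConfig is used):
#
#   python convert_hifigan.py \
#       --checkpoint_path /path/to/espnet_checkpoint.pth \
#       --pytorch_dump_folder_path ./fastspeech2_conformer_hifigan_hf \
#       --yaml_config_path /path/to/espnet/config.yaml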
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FastSpeech2Conformer checkpoint."""
import argparse
import torch
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGan,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerWithHifiGan,
FastSpeech2ConformerWithHifiGanConfig,
logging,
)
from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import (
convert_espnet_state_dict_to_hf,
remap_model_yaml_config,
)
from .convert_hifigan import load_weights, remap_hifigan_yaml_config
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def convert_FastSpeech2ConformerWithHifiGan_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
# Prepare the model
model_params, *_ = remap_model_yaml_config(yaml_config_path)
model_config = FastSpeech2ConformerConfig(**model_params)
model = FastSpeech2ConformerModel(model_config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
# Prepare the vocoder
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
vocoder = FastSpeech2ConformerHifiGan(vocoder_config)
load_weights(espnet_checkpoint, vocoder, vocoder_config)
# Prepare the model + vocoder
config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config)
with_hifigan_model = FastSpeech2ConformerWithHifiGan(config)
with_hifigan_model.model = model
with_hifigan_model.vocoder = vocoder
with_hifigan_model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
with_hifigan_model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument(
"--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
)
parser.add_argument(
"--pytorch_dump_folder_path",
required=True,
default=None,
type=str,
help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_FastSpeech2ConformerWithHifiGan_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
# coding=utf-8
# Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch FastSpeech2Conformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
from torch import nn
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, add_start_docstrings, logging, replace_return_docstrings
from .configuration_fastspeech2_conformer import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerWithHifiGanConfig,
)
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"espnet/fastspeech2_conformer",
# See all FastSpeech2Conformer models at https://huggingface.co/models?filter=fastspeech2_conformer
]
@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
"""
Output type of [`FastSpeech2ConformerModel`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Spectrogram generation loss.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
The predicted spectrogram.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
Outputs of the duration predictor.
pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the pitch predictor.
energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the energy predictor.
"""
loss: Optional[torch.FloatTensor] = None
spectrogram: torch.FloatTensor = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
duration_outputs: torch.LongTensor = None
pitch_outputs: torch.FloatTensor = None
energy_outputs: torch.FloatTensor = None
@dataclass
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
"""
Output type of [`FastSpeech2ConformerWithHifiGan`].
Args:
waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Speech output as a result of passing the predicted mel spectrogram through the vocoder.
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Spectrogram generation loss.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
The predicted spectrogram.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
Outputs of the duration predictor.
pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the pitch predictor.
energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the energy predictor.
"""
waveform: torch.FloatTensor = None
_CONFIG_FOR_DOC = "FastSpeech2ConformerConfig"
FASTSPEECH2_CONFORMER_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerHifiGanConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerWithHifiGanConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
"""
Length regulator for feed-forward Transformer.
This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to
frame-level by repeating each feature based on the corresponding predicted durations.
Args:
encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
Batch of sequences of char or phoneme embeddings.
duration_labels (`torch.LongTensor` of shape `(batch_size, time)`):
Batch of durations of each frame.
speaking_speed (`float`, *optional*, defaults to 1.0):
Value to control speed of speech.
Returns:
`torch.Tensor`:
Replicated input tensor based on durations (batch_size, time*, embedding_dim).
"""
if speaking_speed <= 0:
raise ValueError("`speaking_speed` must be greater than 0.")
elif speaking_speed != 1.0:
duration_labels = torch.round(duration_labels.float() * speaking_speed).long()
if duration_labels.sum() == 0:
duration_labels[duration_labels.sum(dim=1).eq(0)] = 1
# Calculate the maximum length needed
max_len = torch.sum(duration_labels, dim=1).max()
# Create a padded tensor to hold the results
hidden_states = torch.zeros(
(encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
dtype=torch.float,
device=encoded_embeddings.device,
)
# Loop through the batch and fill in the data
for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
hidden_states[i, : repeated.size(0)] = repeated
return hidden_states
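# Illustrative example: with durations [[2, 1]], the first embedding is repeated twice and the
# second once, so a (1, 2, hidden) input expands to a (1, 3, hidden) output (shorter sequences
# in a batch are zero-padded up to the batch maximum):
#
#   >>> embeddings = torch.ones(1, 2, 4)
#   >>> durations = torch.tensor([[2, 1]])
#   >>> length_regulator(embeddings, durations).shape
#   torch.Size([1, 3, 4])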
class FastSpeech2ConformerDurationPredictor(nn.Module):
"""
Duration predictor module.
This is the duration predictor module described in the paper 'FastSpeech: Fast, Robust and Controllable Text to
Speech' https://arxiv.org/pdf/1905.09263.pdf. The duration predictor predicts the duration of each frame in the log
domain from the hidden embeddings of the encoder.
Note:
The calculation domain of the outputs differs between training and inference. During training the outputs are
calculated in the log domain, while at inference time they are converted to the linear domain.
"""
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
self.conv_layers = nn.ModuleList()
self.log_domain_offset = 1.0
for layer_idx in range(config.duration_predictor_layers):
num_chans = config.duration_predictor_channels
input_channels = config.hidden_size if layer_idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(
input_channels,
num_chans,
config.duration_predictor_kernel_size,
config.duration_predictor_dropout_rate,
)
self.conv_layers.append(layer)
self.linear = nn.Linear(config.duration_predictor_channels, 1)
def forward(self, encoder_hidden_states):
"""
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences.
Returns:
`torch.Tensor`: Batch of predicted durations of shape `(batch_size, max_text_length)`, in the log domain during
training and in the linear domain at inference.
"""
# (batch_size, input_dim, max_text_length)
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
# NOTE: calculate in log domain, (batch_size, max_text_length)
hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1)
if not self.training:
# NOTE: calculate in linear domain
hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long()
return hidden_states
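# Illustrative example of the log/linear domain note above: a raw prediction of 1.10 is returned
# as-is in training mode (log domain), while in eval mode it becomes
# clamp(round(exp(1.10) - 1.0), min=0) = round(2.0042) = 2 frames (linear domain).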
# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer
class FastSpeech2ConformerBatchNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
if layer_id == 0:
in_conv_dim = config.num_mel_bins
else:
in_conv_dim = config.speech_decoder_postnet_units
if layer_id == config.speech_decoder_postnet_layers - 1:
out_conv_dim = config.num_mel_bins
else:
out_conv_dim = config.speech_decoder_postnet_units
self.conv = nn.Conv1d(
in_conv_dim,
out_conv_dim,
kernel_size=config.speech_decoder_postnet_kernel,
stride=1,
padding=(config.speech_decoder_postnet_kernel - 1) // 2,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(out_conv_dim)
if layer_id < config.speech_decoder_postnet_layers - 1:
self.activation = nn.Tanh()
else:
self.activation = None
self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
if self.activation is not None:
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
self.layers = nn.ModuleList(
[FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
)
def forward(self, hidden_states: torch.Tensor):
outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
layer_output = outputs_before_postnet.transpose(1, 2)
for layer in self.layers:
layer_output = layer(layer_output)
outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2)
return outputs_before_postnet, outputs_after_postnet
class FastSpeech2ConformerPredictorLayer(nn.Module):
def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
super().__init__()
self.conv = nn.Conv1d(
input_channels,
num_chans,
kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
)
self.activation = nn.ReLU()
self.layer_norm = nn.LayerNorm(num_chans)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
# Perform layer norm on dimension 1
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerVariancePredictor(nn.Module):
def __init__(
self,
config: FastSpeech2ConformerConfig,
num_layers=2,
num_chans=384,
kernel_size=3,
dropout_rate=0.5,
):
"""
Initialize the variance predictor module.
Args:
config ([`FastSpeech2ConformerConfig`]): Model configuration, used to set the input dimension.
num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers.
num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers.
kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers.
dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate.
"""
super().__init__()
self.conv_layers = nn.ModuleList()
for idx in range(num_layers):
input_channels = config.hidden_size if idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate)
self.conv_layers.append(layer)
self.linear = nn.Linear(num_chans, 1)
def forward(self, encoder_hidden_states, padding_masks=None):
"""
Calculate forward propagation.
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences.
padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
Batch of masks indicating padded part.
Returns:
Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
"""
# (batch_size, input_dim, max_text_length)
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
hidden_states = self.linear(hidden_states.transpose(1, 2))
if padding_masks is not None:
hidden_states = hidden_states.masked_fill(padding_masks, 0.0)
return hidden_states
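
# Editor's sketch of the padding handling above (toy sizes): predictions at padded text
# positions are zeroed with `masked_fill`, mirroring the `~attention_mask.bool()` mask that
# `FastSpeech2ConformerModel.forward` passes in (unsqueezed to add the trailing channel dim).
import torch

sketch_predictions = torch.randn(2, 5, 1)  # (batch_size, max_text_length, 1)
sketch_padding_masks = torch.tensor(
    [[False, False, False, True, True], [False, False, False, False, False]]
).unsqueeze(-1)
sketch_predictions = sketch_predictions.masked_fill(sketch_padding_masks, 0.0)
print(sketch_predictions[0, 3:].abs().sum())  # tensor(0.) -> padded positions contribute nothing
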
class FastSpeech2ConformerVarianceEmbedding(nn.Module):
def __init__(
self,
in_channels=1,
out_channels=384,
kernel_size=1,
padding=0,
dropout_rate=0.0,
):
super().__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class FastSpeech2ConformerAttention(nn.Module):
"""
Multi-Head attention layer with relative position encoding. Details can be found in
https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860.
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""Construct an FastSpeech2ConformerAttention object."""
super().__init__()
# We assume d_v always equals dim_key
self.num_heads = module_config["num_attention_heads"]
self.hidden_size = config.hidden_size
self.dim_key = self.hidden_size // self.num_heads
self.head_dim = self.hidden_size // self.num_heads
self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])
# linear transformation for positional encoding
self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        # these two learnable biases are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
def shift_relative_position_tensor(self, pos_tensor):
"""
Args:
pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.
"""
zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)
pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
# only keep the positions from 0 to time2
pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]
return pos_tensor
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
pos_emb: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time2, size)`): Values of the hidden states
attention_mask (`torch.Tensor` of shape `(batch, time1, time2)`): Mask tensor.
pos_emb (`torch.Tensor` of shape `(batch, 2*time1-1, size)`): Positional embedding tensor.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time1, d_model)`.
"""
bsz, q_len, _ = hidden_states.size()
query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
bsz_pos = pos_emb.size(0)
pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim)
# (batch_size, head, time1, dim_key)
query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2)
# (batch_size, head, time1, dim_key)
query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2)
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch_size, head, time1, time2)
matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1))
# compute matrix b and matrix d
# (batch_size, head, time1, 2*time1-1)
matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1))
matrix_bd = self.shift_relative_position_tensor(matrix_bd)
# (batch_size, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key)
# Forward attention
if attention_mask is not None:
expected_size = (bsz, 1, q_len)
if attention_mask.size() != expected_size:
raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}")
attention_mask = attention_mask.unsqueeze(1).eq(0)
min_value = float(torch.finfo(scores.dtype).min)
scores = scores.masked_fill(attention_mask, min_value)
attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0)
else:
attn_weights = torch.softmax(scores, dim=-1)
attn_weights = self.dropout(attn_weights)
attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2))
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
attn_output = self.linear_out(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
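
# Editor's sketch of the relative-shift trick used by `shift_relative_position_tensor` above
# (standalone re-implementation with a tiny tensor, for illustration only): scores over
# 2*time1 - 1 relative offsets are zero-padded, reshaped and sliced so that each query row i
# lines up with keys j = 0..time1-1, yielding one score per (query, key) pair.
import torch

sketch_b, sketch_h, sketch_t1 = 1, 1, 3
sketch_pos_scores = torch.arange(sketch_b * sketch_h * sketch_t1 * (2 * sketch_t1 - 1), dtype=torch.float32)
sketch_pos_scores = sketch_pos_scores.view(sketch_b, sketch_h, sketch_t1, 2 * sketch_t1 - 1)
sketch_zero_pad = torch.zeros((*sketch_pos_scores.size()[:3], 1))
sketch_padded = torch.cat([sketch_zero_pad, sketch_pos_scores], dim=-1)
sketch_padded = sketch_padded.view(
    *sketch_pos_scores.size()[:2], sketch_pos_scores.size(3) + 1, sketch_pos_scores.size(2)
)
sketch_shifted = sketch_padded[:, :, 1:].view_as(sketch_pos_scores)[
    :, :, :, : sketch_pos_scores.size(-1) // 2 + 1
]
print(sketch_shifted.shape)  # torch.Size([1, 1, 3, 3]) -- (batch, heads, time1, time1)
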
class FastSpeech2ConformerConvolutionModule(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
# kernel_size should be an odd number for 'SAME' padding
channels = config.hidden_size
kernel_size = module_config["kernel_size"]
self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
self.depthwise_conv = nn.Conv1d(
channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
)
self.norm = nn.BatchNorm1d(channels)
self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
def forward(self, hidden_states):
"""
Compute convolution module.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
"""
# exchange the temporal dimension and the feature dimension
hidden_states = hidden_states.transpose(1, 2)
# GLU mechanism, (batch_size, 2*channel, dim)
hidden_states = self.pointwise_conv1(hidden_states)
# (batch_size, channel, dim)
hidden_states = nn.functional.glu(hidden_states, dim=1)
# 1D Depthwise Conv
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.norm(hidden_states)
hidden_states = hidden_states * torch.sigmoid(hidden_states)
hidden_states = self.pointwise_conv2(hidden_states)
return hidden_states.transpose(1, 2)
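
# Editor's sketch of the channel bookkeeping in the convolution module above (toy sizes): the
# first pointwise conv doubles the channels, GLU halves them again, and the depthwise conv uses
# groups=channels with 'SAME' padding so each channel is filtered independently at every step.
import torch
import torch.nn as nn

sketch_channels, sketch_kernel, sketch_frames = 8, 7, 20
sketch_x = torch.randn(1, sketch_channels, sketch_frames)  # (batch, channels, time)
sketch_x = nn.Conv1d(sketch_channels, 2 * sketch_channels, kernel_size=1)(sketch_x)
sketch_x = nn.functional.glu(sketch_x, dim=1)  # back to `channels`
sketch_x = nn.Conv1d(
    sketch_channels, sketch_channels, sketch_kernel, padding=(sketch_kernel - 1) // 2, groups=sketch_channels
)(sketch_x)
print(sketch_x.shape)  # torch.Size([1, 8, 20]) -- the time axis is preserved
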
class FastSpeech2ConformerEncoderLayer(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
# self-attention module definition
self.self_attn = FastSpeech2ConformerAttention(config, module_config)
# feed-forward module definition
self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.macaron_style = config.use_macaron_style_in_conformer
if self.macaron_style:
self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
# convolution module definition
self.use_cnn_module = config.use_cnn_in_conformer
if self.use_cnn_module:
self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
self.final_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
self.dropout = nn.Dropout(module_config["dropout_rate"])
self.size = config.hidden_size
self.normalize_before = module_config["normalize_before"]
self.concat_after = module_config["concat_after"]
if self.concat_after:
self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)
def forward(
self,
hidden_states: torch.Tensor,
pos_emb: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
):
"""
Compute encoded features.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor.
            pos_emb (`torch.Tensor` of shape `(1, 2*time-1, size)`): Positional embeddings tensor.
attention_mask (`torch.Tensor` of shape `(batch, time)`): Attention mask tensor for the input.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time, size)`.
"""
# whether to use macaron style
if self.macaron_style:
residual = hidden_states
if self.normalize_before:
hidden_states = self.ff_macaron_layer_norm(hidden_states)
hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states))
if not self.normalize_before:
hidden_states = self.ff_macaron_layer_norm(hidden_states)
# multi-headed self-attention module
residual = hidden_states
if self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
attention_output, attention_scores = self.self_attn(
hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions
)
if self.concat_after:
x_concat = torch.cat((hidden_states, attention_output), dim=-1)
hidden_states = self.concat_linear(x_concat)
hidden_states = residual + hidden_states
else:
hidden_states = self.dropout(attention_output)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# convolution module
if self.use_cnn_module:
residual = hidden_states
if self.normalize_before:
hidden_states = self.conv_layer_norm(hidden_states)
hidden_states = self.conv_module(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.conv_layer_norm(hidden_states)
# feed forward module
residual = hidden_states
if self.normalize_before:
hidden_states = self.ff_layer_norm(hidden_states)
hidden_states = self.feed_forward(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = residual + self.ff_scale * hidden_states
if not self.normalize_before:
hidden_states = self.ff_layer_norm(hidden_states)
        if self.use_cnn_module:
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attention_scores,)
return outputs
class FastSpeech2ConformerMultiLayeredConv1d(nn.Module):
"""
Multi-layered conv1d for Transformer block.
This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer
block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech'
https://arxiv.org/pdf/1905.09263.pdf
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
Initialize FastSpeech2ConformerMultiLayeredConv1d module.
Args:
input_channels (`int`): Number of input channels.
hidden_channels (`int`): Number of hidden channels.
kernel_size (`int`): Kernel size of conv1d.
dropout_rate (`float`): Dropout rate.
"""
super().__init__()
input_channels = config.hidden_size
hidden_channels = module_config["linear_units"]
kernel_size = config.positionwise_conv_kernel_size
self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.dropout = nn.Dropout(module_config["dropout_rate"])
def forward(self, hidden_states):
"""
Calculate forward propagation.
Args:
hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels).
Returns:
            torch.Tensor: Batch of output tensors (batch_size, time, input_channels).
"""
hidden_states = hidden_states.transpose(-1, 1)
hidden_states = self.conv1(hidden_states)
hidden_states = torch.relu(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = hidden_states.transpose(-1, 1)
return hidden_states
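
# Editor's sketch (illustrative sizes): the position-wise feed-forward above replaces the usual
# Linear layers with Conv1d over the time axis, so with kernel_size > 1 each position also mixes
# information from its neighbours while the overall (batch, time, hidden) shape is preserved.
import torch
import torch.nn as nn

sketch_hidden, sketch_units, sketch_kernel = 384, 1536, 3
sketch_conv1 = nn.Conv1d(sketch_hidden, sketch_units, sketch_kernel, padding=(sketch_kernel - 1) // 2)
sketch_conv2 = nn.Conv1d(sketch_units, sketch_hidden, sketch_kernel, padding=(sketch_kernel - 1) // 2)
sketch_input = torch.randn(2, 10, sketch_hidden)  # (batch, time, hidden)
sketch_output = sketch_conv2(torch.relu(sketch_conv1(sketch_input.transpose(-1, 1)))).transpose(-1, 1)
print(sketch_output.shape)  # torch.Size([2, 10, 384])
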
class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
"""
Args:
Relative positional encoding module (new implementation). Details can be found in
https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://arxiv.org/abs/1901.02860
config (`FastSpeech2ConformerConfig`):
FastSpeech2ConformerConfig instance.
module_config (`dict`):
Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
        Construct a PositionalEncoding object.
"""
super().__init__()
self.embed_dim = config.hidden_size
self.input_scale = math.sqrt(self.embed_dim)
self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
self.pos_enc = None
self.max_len = 5000
self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
def extend_pos_enc(self, x):
"""Reset the positional encodings."""
if self.pos_enc is not None:
# self.pos_enc contains both positive and negative parts
# the length of self.pos_enc is 2 * input_len - 1
if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
return
        # Suppose `i` is the position of the query vector and `j` is the position of the key
        # vector. We use positive relative positions when the keys are to the left (i > j) and
        # negative relative positions otherwise (i < j).
pos_enc_positive = torch.zeros(x.size(1), self.embed_dim)
pos_enc_negative = torch.zeros(x.size(1), self.embed_dim)
position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.embed_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embed_dim)
)
pos_enc_positive[:, 0::2] = torch.sin(position * div_term)
pos_enc_positive[:, 1::2] = torch.cos(position * div_term)
pos_enc_negative[:, 0::2] = torch.sin(-1 * position * div_term)
pos_enc_negative[:, 1::2] = torch.cos(-1 * position * div_term)
        # Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
def forward(self, feature_representation):
"""
Args:
feature_representation (`torch.Tensor` of shape (batch_size, time, `*`)):
Input tensor.
Returns:
`torch.Tensor`: Encoded tensor (batch_size, time, `*`).
"""
self.extend_pos_enc(feature_representation)
hidden_states = feature_representation * self.input_scale
center_idx = self.pos_enc.size(1) // 2
pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
return self.dropout(hidden_states), self.dropout(pos_emb)
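
# Editor's sketch of the slice arithmetic in the positional-encoding forward above (assumed
# max_len and frame counts): the table holds 2*max_len - 1 rows covering both positive and
# negative relative offsets, and the slice around the centre returns 2*time - 1 embeddings.
sketch_max_len, sketch_num_frames = 5000, 10
sketch_table_len = 2 * sketch_max_len - 1
sketch_center = sketch_table_len // 2
sketch_start, sketch_stop = sketch_center - sketch_num_frames + 1, sketch_center + sketch_num_frames
print(sketch_stop - sketch_start)  # 19 == 2 * time - 1 relative positions for 10 frames
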
class FastSpeech2ConformerEncoder(nn.Module):
"""
FastSpeech2ConformerEncoder encoder module.
Args:
config (`FastSpeech2ConformerConfig`):
FastSpeech2ConformerConfig instance.
module_config (`dict`):
Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
use_encoder_input_layer (`bool`, *optional*, defaults to `False`):
            Whether to use an embedding layer that maps input token ids to hidden states before the conformer layers.
"""
def __init__(
self,
config: FastSpeech2ConformerConfig,
module_config,
use_encoder_input_layer=False,
):
super().__init__()
self.embed = None
if use_encoder_input_layer:
self.embed = nn.Embedding(
num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=0
)
self.pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, module_config)
self.conformer_layers = nn.ModuleList(
[FastSpeech2ConformerEncoderLayer(config, module_config) for _ in range(module_config["layers"])]
)
def forward(
self,
input_tensor: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = False,
return_dict: Optional[bool] = None,
):
"""
Args:
            input_tensor (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary when `use_encoder_input_layer=True`, otherwise
                already-embedded hidden states. Padding will be ignored by default should you provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
`torch.Tensor`:
Output tensor of shape `(batch, time, attention_dim)`.
"""
feature_representation = input_tensor
if self.embed is not None:
feature_representation = self.embed(feature_representation)
hidden_states, pos_emb = self.pos_enc(feature_representation)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for conformer_layer in self.conformer_layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = conformer_layer(hidden_states, pos_emb, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# Add last layer
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
class FastSpeech2ConformerLoss(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
use_masking = config.use_masking
use_weighted_masking = config.use_weighted_masking
if use_masking and use_weighted_masking:
raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = nn.MSELoss(reduction=reduction)
self.log_domain_offset = 1.0
def forward(
self,
outputs_after_postnet,
outputs_before_postnet,
duration_outputs,
pitch_outputs,
energy_outputs,
spectrogram_labels,
duration_labels,
pitch_labels,
energy_labels,
duration_mask,
spectrogram_mask,
):
"""
Args:
outputs_after_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
Batch of outputs after postnet.
outputs_before_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
Batch of outputs before postnet.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length)`):
Batch of outputs of duration predictor.
pitch_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of outputs of pitch predictor.
energy_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of outputs of energy predictor.
spectrogram_labels (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
Batch of target features.
duration_labels (`torch.LongTensor` of shape `(batch_size, max_text_length)`): Batch of durations.
pitch_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of target token-averaged pitch.
energy_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of target token-averaged energy.
duration_mask (`torch.LongTensor`):
Mask used to discern which values the duration loss should be calculated for.
spectrogram_mask (`torch.LongTensor`):
                Mask used to discern which values the spectrogram loss should be calculated for.
Returns:
`tuple(torch.FloatTensor)`: Tuple of tensors containing, in order, the L1 loss value, duration predictor
loss value, pitch predictor loss value, and energy predictor loss value.
"""
pitch_and_energy_masks = duration_mask.unsqueeze(-1)
# apply mask to remove padded part
if self.use_masking:
outputs_before_postnet = outputs_before_postnet.masked_select(spectrogram_mask)
if outputs_after_postnet is not None:
outputs_after_postnet = outputs_after_postnet.masked_select(spectrogram_mask)
spectrogram_labels = spectrogram_labels.masked_select(spectrogram_mask)
duration_outputs = duration_outputs.masked_select(duration_mask)
duration_labels = duration_labels.masked_select(duration_mask)
pitch_outputs = pitch_outputs.masked_select(pitch_and_energy_masks)
energy_outputs = energy_outputs.masked_select(pitch_and_energy_masks)
pitch_labels = pitch_labels.masked_select(pitch_and_energy_masks)
energy_labels = energy_labels.masked_select(pitch_and_energy_masks)
# calculate loss
l1_loss = self.l1_criterion(outputs_before_postnet, spectrogram_labels)
if outputs_after_postnet is not None:
l1_loss = l1_loss + self.l1_criterion(outputs_after_postnet, spectrogram_labels)
duration_labels = torch.log(duration_labels.float() + self.log_domain_offset)
duration_loss = self.duration_criterion(duration_outputs, duration_labels)
pitch_loss = self.mse_criterion(pitch_outputs, pitch_labels)
energy_loss = self.mse_criterion(energy_outputs, energy_labels)
# make weighted mask and apply it
if self.use_weighted_masking:
spectrogram_mask = nn.functional.pad(
spectrogram_mask.transpose(1, 2),
[0, spectrogram_labels.size(1) - spectrogram_mask.size(1), 0, 0, 0, 0],
value=False,
).transpose(1, 2)
out_weights = spectrogram_mask.float() / spectrogram_mask.sum(dim=1, keepdim=True).float()
out_weights /= spectrogram_labels.size(0) * spectrogram_labels.size(2)
duration_weights = duration_mask.float() / duration_mask.sum(dim=1, keepdim=True).float()
duration_weights /= duration_labels.size(0)
# apply weight
l1_loss = l1_loss.mul(out_weights).masked_select(spectrogram_mask).sum()
duration_loss = duration_loss.mul(duration_weights).masked_select(duration_mask).sum()
pitch_weights = duration_weights.unsqueeze(-1)
pitch_loss = pitch_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
energy_loss = energy_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
return l1_loss + duration_loss + pitch_loss + energy_loss
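
# Editor's sketch (toy values): the duration loss above is computed in the log domain with a
# +1 offset, so a zero-length duration maps to log(1) = 0 rather than -inf.
import torch

sketch_durations = torch.tensor([0, 2, 5], dtype=torch.float32)
print(torch.log(sketch_durations + 1.0))  # tensor([0.0000, 1.0986, 1.7918])
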
class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FastSpeech2ConformerConfig
base_model_prefix = "fastspeech2_conformer"
main_input_name = "input_ids"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.LayerNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
key = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-key, b=key)
elif isinstance(module, nn.Embedding):
module.weight.data.normal_()
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, FastSpeech2ConformerAttention):
nn.init.xavier_uniform_(module.pos_bias_u)
nn.init.xavier_uniform_(module.pos_bias_v)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, FastSpeech2ConformerEncoder):
module.gradient_checkpointing = value
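
# Editor's sketch of the Conv1d initialisation in `_init_weights` above (illustrative sizes):
# the weight gets a Kaiming-normal init and the bias is drawn from U(-key, key) with
# key = sqrt(groups / (in_channels * kernel_size)).
import math
import torch.nn as nn

sketch_conv = nn.Conv1d(in_channels=384, out_channels=384, kernel_size=3, groups=1, bias=True)
nn.init.kaiming_normal_(sketch_conv.weight)
sketch_key = math.sqrt(sketch_conv.groups / (sketch_conv.in_channels * sketch_conv.kernel_size[0]))
nn.init.uniform_(sketch_conv.bias, a=-sketch_key, b=sketch_key)
print(round(sketch_key, 4))  # ~0.0295 for these illustrative sizes
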
@add_start_docstrings(
"""FastSpeech2Conformer Model.""",
FASTSPEECH2_CONFORMER_START_DOCSTRING,
)
class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
"""
FastSpeech 2 module.
This is a module of FastSpeech 2 described in 'FastSpeech 2: Fast and High-Quality End-to-End Text to Speech'
    https://arxiv.org/abs/2006.04558. Instead of quantized pitch and energy, we use token-averaged values introduced in
FastPitch: Parallel Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers instead of regular
Transformers.
"""
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__(config)
self.config = config
# store hyperparameters
self.vocab_size = config.vocab_size
self.num_mel_bins = config.num_mel_bins
self.hidden_size = config.hidden_size
self.reduction_factor = config.reduction_factor
self.stop_gradient_from_pitch_predictor = config.stop_gradient_from_pitch_predictor
self.stop_gradient_from_energy_predictor = config.stop_gradient_from_energy_predictor
self.multilingual_model = config.num_languages is not None and config.num_languages > 1
if self.multilingual_model:
self.language_id_embedding = torch.nn.Embedding(config.num_languages, self.hidden_size)
self.multispeaker_model = config.num_speakers is not None and config.num_speakers > 1
if self.multispeaker_model:
self.speaker_id_embedding = torch.nn.Embedding(config.num_speakers, config.hidden_size)
self.speaker_embed_dim = config.speaker_embed_dim
if self.speaker_embed_dim:
self.projection = nn.Linear(config.hidden_size + self.speaker_embed_dim, config.hidden_size)
self.encoder = FastSpeech2ConformerEncoder(config, config.encoder_config, use_encoder_input_layer=True)
self.duration_predictor = FastSpeech2ConformerDurationPredictor(config)
self.pitch_predictor = FastSpeech2ConformerVariancePredictor(
config,
num_layers=config.pitch_predictor_layers,
num_chans=config.pitch_predictor_channels,
kernel_size=config.pitch_predictor_kernel_size,
dropout_rate=config.pitch_predictor_dropout,
)
# continuous pitch + FastPitch style avg
self.pitch_embed = FastSpeech2ConformerVarianceEmbedding(
out_channels=self.hidden_size,
kernel_size=config.pitch_embed_kernel_size,
padding=(config.pitch_embed_kernel_size - 1) // 2,
dropout_rate=config.pitch_embed_dropout,
)
self.energy_predictor = FastSpeech2ConformerVariancePredictor(
config,
num_layers=config.energy_predictor_layers,
num_chans=config.energy_predictor_channels,
kernel_size=config.energy_predictor_kernel_size,
dropout_rate=config.energy_predictor_dropout,
)
# continuous energy + FastPitch style avg
self.energy_embed = FastSpeech2ConformerVarianceEmbedding(
out_channels=self.hidden_size,
kernel_size=config.energy_embed_kernel_size,
padding=(config.energy_embed_kernel_size - 1) // 2,
dropout_rate=config.energy_embed_dropout,
)
# The decoder is an encoder
self.decoder = FastSpeech2ConformerEncoder(config, config.decoder_config, use_encoder_input_layer=False)
self.speech_decoder_postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)
self.criterion = FastSpeech2ConformerLoss(config)
self.post_init()
@replace_return_docstrings(output_type=FastSpeech2ConformerModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Union[Tuple, FastSpeech2ConformerModelOutput]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`FastSpeech2ConformerTokenizer`].
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to `None`):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
`[0, 1]`: 0 for tokens that are **masked**, 1 for tokens that are **not masked**.
spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
Batch of padded target features.
duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
Batch of padded durations.
pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged pitch.
energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged energy.
speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Speaker ids used to condition features of speech output by the model.
lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Language ids used to condition features of speech output by the model.
speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
Embedding containing conditioning signals for the features of the speech.
return_dict (`bool`, *optional*, defaults to `None`):
Whether or not to return a [`FastSpeech2ConformerModelOutput`] instead of a plain tuple.
output_attentions (`bool`, *optional*, defaults to `None`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*, defaults to `None`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
Returns:
Example:
```python
>>> from transformers import (
... FastSpeech2ConformerTokenizer,
... FastSpeech2ConformerModel,
... FastSpeech2ConformerHifiGan,
... )
>>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
>>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
>>> output_dict = model(input_ids, return_dict=True)
>>> spectrogram = output_dict["spectrogram"]
>>> vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan")
>>> waveform = vocoder(spectrogram)
>>> print(waveform.shape)
torch.Size([1, 49664])
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if attention_mask is None:
            attention_mask = torch.ones(input_ids.shape, device=input_ids.device)
has_missing_labels = (
spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
)
if self.training and has_missing_labels:
raise ValueError("All labels must be provided to run in training mode.")
# forward encoder
text_masks = attention_mask.unsqueeze(-2)
encoder_outputs = self.encoder(
input_ids,
text_masks,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
# Integrate with language id, speaker id, and speaker embedding
if self.multispeaker_model and speaker_ids is not None:
speaker_id_embeddings = self.speaker_id_embedding(speaker_ids.view(-1))
hidden_states = hidden_states + speaker_id_embeddings.unsqueeze(1)
if self.multilingual_model and lang_ids is not None:
            language_id_embeddings = self.language_id_embedding(lang_ids.view(-1))
            hidden_states = hidden_states + language_id_embeddings.unsqueeze(1)
if self.speaker_embed_dim is not None and speaker_embedding is not None:
embeddings_expanded = (
nn.functional.normalize(speaker_embedding).unsqueeze(1).expand(-1, hidden_states.size(1), -1)
)
hidden_states = self.projection(torch.cat([hidden_states, embeddings_expanded], dim=-1))
# forward duration predictor and variance predictors
duration_mask = ~attention_mask.bool()
if self.stop_gradient_from_pitch_predictor:
pitch_predictions = self.pitch_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
else:
pitch_predictions = self.pitch_predictor(hidden_states, duration_mask.unsqueeze(-1))
if self.stop_gradient_from_energy_predictor:
energy_predictions = self.energy_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
else:
energy_predictions = self.energy_predictor(hidden_states, duration_mask.unsqueeze(-1))
duration_predictions = self.duration_predictor(hidden_states)
duration_predictions = duration_predictions.masked_fill(duration_mask, 0.0)
if not self.training:
# use prediction in inference
embedded_pitch_curve = self.pitch_embed(pitch_predictions)
embedded_energy_curve = self.energy_embed(energy_predictions)
hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
hidden_states = length_regulator(hidden_states, duration_predictions, self.config.speaking_speed)
else:
# use groundtruth in training
embedded_pitch_curve = self.pitch_embed(pitch_labels)
embedded_energy_curve = self.energy_embed(energy_labels)
hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
hidden_states = length_regulator(hidden_states, duration_labels)
# forward decoder
if not self.training:
hidden_mask = None
else:
spectrogram_mask = (spectrogram_labels != -100).any(dim=-1)
spectrogram_mask = spectrogram_mask.int()
if self.reduction_factor > 1:
length_dim = spectrogram_mask.shape[1] - spectrogram_mask.shape[1] % self.reduction_factor
                spectrogram_mask = spectrogram_mask[:, :length_dim]
hidden_mask = spectrogram_mask.unsqueeze(-2)
decoder_outputs = self.decoder(
hidden_states,
hidden_mask,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
outputs_before_postnet, outputs_after_postnet = self.speech_decoder_postnet(decoder_outputs[0])
loss = None
if self.training:
# calculate loss
loss_duration_mask = ~duration_mask
loss_spectrogram_mask = spectrogram_mask.unsqueeze(-1).bool()
loss = self.criterion(
outputs_after_postnet=outputs_after_postnet,
outputs_before_postnet=outputs_before_postnet,
duration_outputs=duration_predictions,
pitch_outputs=pitch_predictions,
energy_outputs=energy_predictions,
spectrogram_labels=spectrogram_labels,
duration_labels=duration_labels,
pitch_labels=pitch_labels,
energy_labels=energy_labels,
duration_mask=loss_duration_mask,
spectrogram_mask=loss_spectrogram_mask,
)
if not return_dict:
postnet_outputs = (outputs_after_postnet,)
audio_feature_predictions = (
duration_predictions,
pitch_predictions,
energy_predictions,
)
outputs = postnet_outputs + encoder_outputs + decoder_outputs[1:] + audio_feature_predictions
return ((loss,) + outputs) if loss is not None else outputs
return FastSpeech2ConformerModelOutput(
loss=loss,
spectrogram=outputs_after_postnet,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
duration_outputs=duration_predictions,
pitch_outputs=pitch_predictions,
energy_outputs=energy_predictions,
)
# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
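
# Editor's sketch (toy sizes): the `get_padding` formula above, (kernel_size * dilation -
# dilation) // 2, gives 'SAME' padding for odd kernels, so every dilated conv in the residual
# block keeps the time axis unchanged and the residual addition lines up.
import torch
import torch.nn as nn

sketch_kernel, sketch_dilation, sketch_chan, sketch_len = 3, 5, 4, 50
sketch_pad = (sketch_kernel * sketch_dilation - sketch_dilation) // 2
sketch_conv = nn.Conv1d(sketch_chan, sketch_chan, sketch_kernel, dilation=sketch_dilation, padding=sketch_pad)
print(sketch_conv(torch.randn(1, sketch_chan, sketch_len)).shape)  # torch.Size([1, 4, 50])
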
@add_start_docstrings(
"""HiFi-GAN vocoder.""",
HIFIGAN_START_DOCSTRING,
)
# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer
class FastSpeech2ConformerHifiGan(PreTrainedModel):
config_class = FastSpeech2ConformerHifiGanConfig
main_input_name = "spectrogram"
def __init__(self, config: FastSpeech2ConformerHifiGanConfig):
super().__init__(config)
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
config.model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
self.register_buffer("mean", torch.zeros(config.model_in_dim))
self.register_buffer("scale", torch.ones(config.model_in_dim))
# Initialize weights and apply final processing
self.post_init()
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.weight_norm(layer)
for layer in self.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.conv_post)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.conv_post)
def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
if self.config.normalize_before:
spectrogram = (spectrogram - self.mean) / self.scale
is_batched = spectrogram.dim() == 3
if not is_batched:
spectrogram = spectrogram.unsqueeze(0)
hidden_states = spectrogram.transpose(2, 1)
hidden_states = self.conv_pre(hidden_states)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
hidden_states = torch.tanh(hidden_states)
if not is_batched:
# remove batch dim and collapse tensor to 1-d audio waveform
waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
else:
# remove seq-len dim since this collapses to 1
waveform = hidden_states.squeeze(1)
return waveform
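
# Editor's sketch (assumed upsample rates, not read from any config): the waveform length is
# the number of spectrogram frames times the product of the transposed-conv upsample rates; a
# total factor of 256 would turn 194 frames into the 49,664 samples seen in the example above.
import math

sketch_upsample_rates = [8, 8, 2, 2]  # hypothetical values for illustration
sketch_frames = 194
print(math.prod(sketch_upsample_rates), sketch_frames * math.prod(sketch_upsample_rates))  # 256 49664
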
@add_start_docstrings(
"The FastSpeech2ConformerModel with a FastSpeech2ConformerHifiGan vocoder head that performs text-to-speech (waveform).",
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING,
)
class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
config_class = FastSpeech2ConformerWithHifiGanConfig
def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig):
super().__init__(config)
self.model = FastSpeech2ConformerModel(config.model_config)
self.vocoder = FastSpeech2ConformerHifiGan(config.vocoder_config)
self.config = config
@replace_return_docstrings(
output_type=FastSpeech2ConformerWithHifiGanOutput, config_class=FastSpeech2ConformerWithHifiGanConfig
)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Union[Tuple, FastSpeech2ConformerModelOutput]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`FastSpeech2ConformerTokenizer`].
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to `None`):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
`[0, 1]`: 0 for tokens that are **masked**, 1 for tokens that are **not masked**.
spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
Batch of padded target features.
duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
Batch of padded durations.
pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged pitch.
energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged energy.
speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Speaker ids used to condition features of speech output by the model.
lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Language ids used to condition features of speech output by the model.
speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
Embedding containing conditioning signals for the features of the speech.
return_dict (`bool`, *optional*, defaults to `None`):
Whether or not to return a [`FastSpeech2ConformerModelOutput`] instead of a plain tuple.
output_attentions (`bool`, *optional*, defaults to `None`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*, defaults to `None`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
Returns:
Example:
```python
>>> from transformers import (
... FastSpeech2ConformerTokenizer,
... FastSpeech2ConformerWithHifiGan,
... )
>>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
>>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
>>> output_dict = model(input_ids, return_dict=True)
>>> waveform = output_dict["waveform"]
>>> print(waveform.shape)
torch.Size([1, 49664])
```
"""
return_dict = return_dict if return_dict is not None else self.config.model_config.use_return_dict
output_attentions = (
output_attentions if output_attentions is not None else self.config.model_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.model_config.output_hidden_states
)
model_outputs = self.model(
input_ids,
attention_mask,
spectrogram_labels=spectrogram_labels,
duration_labels=duration_labels,
pitch_labels=pitch_labels,
energy_labels=energy_labels,
speaker_ids=speaker_ids,
lang_ids=lang_ids,
speaker_embedding=speaker_embedding,
return_dict=return_dict,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
if not return_dict:
has_missing_labels = (
spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
)
if has_missing_labels:
spectrogram = model_outputs[0]
else:
spectrogram = model_outputs[1]
else:
spectrogram = model_outputs["spectrogram"]
waveform = self.vocoder(spectrogram)
if not return_dict:
return model_outputs + (waveform,)
return FastSpeech2ConformerWithHifiGanOutput(waveform=waveform, **model_outputs)
# coding=utf-8
# Copyright 2023 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for FastSpeech2Conformer."""
import json
import os
from typing import Optional, Tuple
import regex
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/vocab.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    # Set to a somewhat arbitrary large number as the model input
    # isn't constrained by the relative positional encoding
"espnet/fastspeech2_conformer": 4096,
}
class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
"""
Construct a FastSpeech2Conformer tokenizer.
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
            The beginning of sequence token. Note that for FastSpeech2, it is the same as the `eos_token`.
eos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
The end of sequence token. Note that for FastSpeech2, it is the same as the `bos_token`.
pad_token (`str`, *optional*, defaults to `"<blank>"`):
The token used for padding, for example when batching sequences of different lengths.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
should_strip_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to strip the spaces from the list of tokens.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
bos_token="<sos/eos>",
eos_token="<sos/eos>",
pad_token="<blank>",
unk_token="<unk>",
should_strip_spaces=False,
**kwargs,
):
requires_backends(self, "g2p_en")
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
import g2p_en
self.g2p = g2p_en.G2p()
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
should_strip_spaces=should_strip_spaces,
**kwargs,
)
self.should_strip_spaces = should_strip_spaces
@property
def vocab_size(self):
return len(self.decoder)
def get_vocab(self):
"Returns vocab as a dict"
return dict(self.encoder, **self.added_tokens_encoder)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# expand symbols
text = regex.sub(";", ",", text)
text = regex.sub(":", ",", text)
text = regex.sub("-", " ", text)
text = regex.sub("&", "and", text)
# strip unnecessary symbols
text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
# strip whitespaces
text = regex.sub(r"\s+", " ", text)
text = text.upper()
return text, kwargs
def _tokenize(self, text):
"""Returns a tokenized string."""
# phonemize
tokens = self.g2p(text)
if self.should_strip_spaces:
tokens = list(filter(lambda s: s != " ", tokens))
tokens.append(self.eos_token)
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
# Override since phonemes cannot be converted back to strings
def decode(self, token_ids, **kwargs):
        logger.warning(
            "Phonemes cannot be reliably converted to a string due to the one-to-many mapping, converting to tokens instead."
)
return self.convert_ids_to_tokens(token_ids)
# Override since phonemes cannot be converted back to strings
def convert_tokens_to_string(self, tokens, **kwargs):
        logger.warning(
            "Phonemes cannot be reliably converted to a string due to the one-to-many mapping, returning the tokens."
)
return tokens
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.get_vocab(), ensure_ascii=False))
return (vocab_file,)
def __getstate__(self):
state = self.__dict__.copy()
state["g2p"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import g2p_en
self.g2p = g2p_en.G2p()
except ImportError:
raise ImportError(
"You need to install g2p-en to use FastSpeech2ConformerTokenizer. "
"See https://pypi.org/project/g2p-en/ for installation."
)
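
# Editor's sketch of the `prepare_for_tokenization` normalisation above on a sample string
# (the standard-library `re` module is used here only because it behaves like `regex` for
# these simple patterns):
import re

sketch_text = "Hello - world & friends; (see docs)"
sketch_text = re.sub(";", ",", sketch_text)
sketch_text = re.sub(":", ",", sketch_text)
sketch_text = re.sub("-", " ", sketch_text)
sketch_text = re.sub("&", "and", sketch_text)
sketch_text = re.sub(r"[\(\)\[\]\<\>\"]+", "", sketch_text)
sketch_text = re.sub(r"\s+", " ", sketch_text)
print(sketch_text.upper())  # HELLO WORLD AND FRIENDS, SEE DOCS
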
@@ -2655,6 +2655,7 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
         return_dict: Optional[bool] = None,
         speaker_embeddings: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.FloatTensor] = None,
+        stop_labels: Optional[torch.Tensor] = None,
     ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -2973,6 +2974,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
         return_dict: Optional[bool] = None,
         speaker_embeddings: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.FloatTensor] = None,
+        stop_labels: Optional[torch.Tensor] = None,
     ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -67,6 +67,7 @@ from .utils import (
     is_flax_available,
     is_fsdp_available,
     is_ftfy_available,
+    is_g2p_en_available,
     is_ipex_available,
     is_jieba_available,
     is_jinja_available,
@@ -365,6 +366,13 @@ def require_fsdp(test_case, min_version: str = "1.12.0"):
     )


+def require_g2p_en(test_case):
+    """
+    Decorator marking a test that requires g2p_en. These tests are skipped when g2p_en isn't installed.
+    """
+    return unittest.skipUnless(is_g2p_en_available(), "test requires g2p_en")(test_case)
+
+
 def require_safetensors(test_case):
     """
     Decorator marking a test that requires safetensors. These tests are skipped when safetensors isn't installed.
@@ -123,6 +123,7 @@ from .import_utils import (
     is_flax_available,
     is_fsdp_available,
     is_ftfy_available,
+    is_g2p_en_available,
     is_in_notebook,
     is_ipex_available,
     is_jieba_available,
@@ -3422,6 +3422,37 @@ class FalconPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])


+FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class FastSpeech2ConformerHifiGan(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class FastSpeech2ConformerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class FastSpeech2ConformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class FastSpeech2ConformerWithHifiGan(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
...@@ -94,6 +94,7 @@ except importlib.metadata.PackageNotFoundError: ...@@ -94,6 +94,7 @@ except importlib.metadata.PackageNotFoundError:
except importlib.metadata.PackageNotFoundError: except importlib.metadata.PackageNotFoundError:
_faiss_available = False _faiss_available = False
_ftfy_available = _is_package_available("ftfy") _ftfy_available = _is_package_available("ftfy")
_g2p_en_available = _is_package_available("g2p_en")
_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
_jieba_available = _is_package_available("jieba")
_jinja_available = _is_package_available("jinja2")
@@ -444,6 +445,10 @@ def is_ftfy_available():
return _ftfy_available
def is_g2p_en_available():
return _g2p_en_available
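# Illustration only: downstream code can use the new helper to guard an optional g2p_en
# import instead of wrapping the import in try/except. The variable names are invented.
from transformers.utils import is_g2p_en_available
if is_g2p_en_available():
    from g2p_en import G2p
    phonemes = G2p()("thirty two")  # ARPAbet-style phoneme list
else:
    phonemes = None  # degrade gracefully when the backend is missing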
@lru_cache()
def is_torch_tpu_available(check_device=True):
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
@@ -1059,6 +1064,12 @@ LEVENSHTEIN_IMPORT_ERROR = """
install python-Levenshtein`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
G2P_EN_IMPORT_ERROR = """
{0} requires the g2p-en library but it was not found in your environment. You can install it with pip:
`pip install g2p-en`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
PYTORCH_QUANTIZATION_IMPORT_ERROR = """
{0} requires the pytorch-quantization library but it was not found in your environment. You can install it with pip:
@@ -1101,7 +1112,6 @@ SACREMOSES_IMPORT_ERROR = """
`pip install sacremoses`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
SCIPY_IMPORT_ERROR = """
{0} requires the scipy library but it was not found in your environment. You can install it with pip:
@@ -1225,6 +1235,7 @@ BACKENDS_MAPPING = OrderedDict(
("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)),
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)), ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)), ("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)),
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch FastSpeech2Conformer model."""
import inspect
import tempfile
import unittest
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerTokenizer,
FastSpeech2ConformerWithHifiGanConfig,
is_torch_available,
)
from transformers.testing_utils import require_g2p_en, require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
if is_torch_available():
import torch
from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerWithHifiGan, set_seed
class FastSpeech2ConformerModelTester:
def __init__(
self,
parent,
batch_size=13,
num_hidden_layers=1,
num_attention_heads=2,
hidden_size=24,
seq_length=7,
encoder_linear_units=384,
decoder_linear_units=384,
is_training=False,
speech_decoder_postnet_units=128,
speech_decoder_postnet_layers=2,
pitch_predictor_layers=1,
energy_predictor_layers=1,
duration_predictor_layers=1,
num_mel_bins=8,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.vocab_size = hidden_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.encoder_linear_units = encoder_linear_units
self.decoder_linear_units = decoder_linear_units
self.speech_decoder_postnet_units = speech_decoder_postnet_units
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
self.pitch_predictor_layers = pitch_predictor_layers
self.energy_predictor_layers = energy_predictor_layers
self.duration_predictor_layers = duration_predictor_layers
self.num_mel_bins = num_mel_bins
def prepare_config_and_inputs(self):
config = self.get_config()
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
return config, input_ids
def get_config(self):
return FastSpeech2ConformerConfig(
hidden_size=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_linear_units=self.encoder_linear_units,
decoder_linear_units=self.decoder_linear_units,
speech_decoder_postnet_units=self.speech_decoder_postnet_units,
speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
num_mel_bins=self.num_mel_bins,
pitch_predictor_layers=self.pitch_predictor_layers,
energy_predictor_layers=self.energy_predictor_layers,
duration_predictor_layers=self.duration_predictor_layers,
)
def create_and_check_model(self, config, input_ids, *args):
model = FastSpeech2ConformerModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, return_dict=True)
# total of 5 keys in result
self.parent.assertEqual(len(result), 5)
# check batch sizes match
for value in result.values():
self.parent.assertEqual(value.size(0), self.batch_size)
# check duration, pitch, and energy have the appropriate shapes
# duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1)
self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape)
self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape)
# check predicted mel-spectrogram has correct dimension
self.parent.assertEqual(result["spectrogram"].size(2), model.config.num_mel_bins)
def prepare_config_and_inputs_for_common(self):
config, input_ids = self.prepare_config_and_inputs()
inputs_dict = {"input_ids": input_ids}
return config, inputs_dict
@require_torch
class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (FastSpeech2ConformerModel,) if is_torch_available() else ()
test_pruning = False
test_headmasking = False
test_torchscript = False
test_resize_embeddings = False
is_encoder_decoder = True
def setUp(self):
self.model_tester = FastSpeech2ConformerModelTester(self)
self.config_tester = ConfigTester(self, config_class=FastSpeech2ConformerConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
msg = f"Parameter {name} of model {model_class} seems not properly initialized"
if "norm" in name:
if "bias" in name:
self.assertEqual(param.data.mean().item(), 0.0, msg=msg)
if "weight" in name:
self.assertEqual(param.data.mean().item(), 1.0, msg=msg)
elif "conv" in name or "embed" in name:
self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg)
def test_duration_energy_pitch_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
# duration
self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len])
# energy
self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
# pitch
self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
def test_hidden_states_output(self):
def _check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]):
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
self.assertIsInstance(hidden_states, (list, tuple))
expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape
self.assertEqual(expected_batch_size, self.model_tester.batch_size)
# Only test encoder seq_length since decoder seq_length is variable based on inputs
if idx == 0:
self.assertEqual(expected_seq_length, self.model_tester.seq_length)
self.assertEqual(expected_hidden_size, self.model_tester.hidden_size)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict["output_hidden_states"] = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel)
def test_save_load_strict(self):
config, _ = self.model_tester.prepare_config_and_inputs()
model = FastSpeech2ConformerModel(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
_, info = FastSpeech2ConformerModel.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], [])
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = FastSpeech2ConformerModel(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = [
"input_ids",
"attention_mask",
"spectrogram_labels",
"duration_labels",
"pitch_labels",
"energy_labels",
"speaker_ids",
"lang_ids",
"speaker_embedding",
"return_dict",
"output_attentions",
"output_hidden_states",
]
self.assertListEqual(arg_names, expected_arg_names)
# Override as FastSpeech2Conformer does not output cross attentions
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True
config.output_attentions = True
model = FastSpeech2ConformerModel(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerModel)
outputs = model(**inputs)
output = outputs[0]
encoder_hidden_states = outputs.encoder_hidden_states[0]
encoder_hidden_states.retain_grad()
decoder_hidden_states = outputs.decoder_hidden_states[0]
decoder_hidden_states.retain_grad()
encoder_attentions = outputs.encoder_attentions[0]
encoder_attentions.retain_grad()
decoder_attentions = outputs.decoder_attentions[0]
decoder_attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(decoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
self.assertIsNotNone(decoder_attentions.grad)
def test_attention_outputs(self):
"""
Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable
decoder attention shape, and uniquely outputs energy, pitch, and durations.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
encoder_attentions = outputs.encoder_attentions
self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(encoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
out_len = len(outputs)
correct_outlen = 7
self.assertEqual(out_len, correct_outlen)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
added_hidden_states = 2
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.encoder_attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
@slow
def test_model_from_pretrained(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
self.assertIsNotNone(model)
@unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="FastSpeech2Conformer has no input embeddings")
def test_model_common_attributes(self):
pass
@require_torch
@require_g2p_en
@slow
class FastSpeech2ConformerModelIntegrationTest(unittest.TestCase):
def test_inference_integration(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
model.to(torch_device)
model.eval()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
text = "Test that this generates speech"
input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"]
outputs_dict = model(input_ids)
spectrogram = outputs_dict["spectrogram"]
# mel-spectrogram is too large (1, 205, 80), so only check top-left 100 elements
# fmt: off
expected_mel_spectrogram = torch.tensor(
[
[-1.2426, -1.7286, -1.6754, -1.7451, -1.6402, -1.5219, -1.4480, -1.3345, -1.4031, -1.4497],
[-0.7858, -1.4966, -1.3602, -1.4876, -1.2949, -1.0723, -1.0021, -0.7553, -0.6521, -0.6929],
[-0.7298, -1.3908, -1.0369, -1.2656, -1.0342, -0.7883, -0.7420, -0.5249, -0.3734, -0.3977],
[-0.4784, -1.3508, -1.1558, -1.4678, -1.2820, -1.0252, -1.0868, -0.9006, -0.8947, -0.8448],
[-0.3963, -1.2895, -1.2813, -1.6147, -1.4658, -1.2560, -1.4134, -1.2650, -1.3255, -1.1715],
[-1.4914, -1.3097, -0.3821, -0.3898, -0.5748, -0.9040, -1.0755, -1.0575, -1.2205, -1.0572],
[0.0197, -0.0582, 0.9147, 1.1512, 1.1651, 0.6628, -0.1010, -0.3085, -0.2285, 0.2650],
[1.1780, 0.1803, 0.7251, 1.5728, 1.6678, 0.4542, -0.1572, -0.1787, 0.0744, 0.8168],
[-0.2078, -0.3211, 1.1096, 1.5085, 1.4632, 0.6299, -0.0515, 0.0589, 0.8609, 1.4429],
[0.7831, -0.2663, 1.0352, 1.4489, 0.9088, 0.0247, -0.3995, 0.0078, 1.2446, 1.6998],
],
device=torch_device,
)
# fmt: on
self.assertTrue(torch.allclose(spectrogram[0, :10, :10], expected_mel_spectrogram, atol=1e-4))
self.assertEqual(spectrogram.shape, (1, 205, model.config.num_mel_bins))
def test_training_integration(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
model.to(torch_device)
# Set self.training manually (rather than calling model.train()) to keep the run deterministic while still exercising the training path
model.training = True
set_seed(0)
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
text = "Test that this generates speech"
input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"]
# NOTE: Dummy numbers since FastSpeech2Conformer does not have a feature extractor due to the package deps required (librosa, MFA)
batch_size, max_text_len = input_ids.shape
pitch_labels = torch.rand((batch_size, max_text_len, 1), dtype=torch.float, device=torch_device)
energy_labels = torch.rand((batch_size, max_text_len, 1), dtype=torch.float, device=torch_device)
duration_labels = torch.normal(10, 2, size=(batch_size, max_text_len)).clamp(1, 20).int()
max_target_len, _ = duration_labels.sum(dim=1).max(dim=0)
max_target_len = max_target_len.item()
spectrogram_labels = torch.rand(
(batch_size, max_target_len, model.num_mel_bins), dtype=torch.float, device=torch_device
)
outputs_dict = model(
input_ids,
spectrogram_labels=spectrogram_labels,
duration_labels=duration_labels,
pitch_labels=pitch_labels,
energy_labels=energy_labels,
return_dict=True,
)
spectrogram = outputs_dict["spectrogram"]
loss = outputs_dict["loss"]
# mel-spectrogram is too large (1, 224, 80), so only check top-left 100 elements
# fmt: off
expected_mel_spectrogram = torch.tensor(
[
[-1.0643e+00, -6.8058e-01, -1.0901e+00, -8.2724e-01, -7.7241e-01, -1.1905e+00, -8.5725e-01, -8.2930e-01, -1.1313e+00, -1.2449e+00],
[-5.5067e-01, -2.7045e-01, -6.3483e-01, -1.9320e-01, 1.0234e-01, -3.3253e-01, -2.4423e-01, -3.5045e-01, -5.2070e-01, -4.3710e-01],
[ 2.2181e-01, 3.1433e-01, -1.2849e-01, 6.0253e-01, 1.0033e+00, 1.3952e-01, 1.2851e-01, -2.3063e-02, -1.5092e-01, 2.4903e-01],
[ 4.6343e-01, 4.1820e-01, 1.6468e-01, 1.1297e+00, 1.4588e+00, 1.3737e-01, 6.6355e-02, -6.0973e-02, -5.4225e-02, 5.9208e-01],
[ 5.2762e-01, 4.8725e-01, 4.2735e-01, 1.4392e+00, 1.7398e+00, 2.4891e-01, -8.4531e-03, -8.1282e-02, 1.2857e-01, 8.7559e-01],
[ 5.2548e-01, 5.1653e-01, 5.2034e-01, 1.3782e+00, 1.5972e+00, 1.6380e-01, -5.1807e-02, 1.5474e-03, 2.2824e-01, 8.5288e-01],
[ 3.6356e-01, 4.4109e-01, 4.4257e-01, 9.4273e-01, 1.1201e+00, -9.0551e-03, -1.1627e-01, -2.0821e-02, 1.0793e-01, 5.0336e-01],
[ 3.6598e-01, 3.2708e-01, 1.3297e-01, 4.5162e-01, 6.4168e-01, -2.6923e-01, -2.3101e-01, -1.4943e-01, -1.4732e-01, 7.3057e-02],
[ 2.7639e-01, 2.2588e-01, -1.5310e-01, 1.0957e-01, 3.3048e-01, -5.3431e-01, -3.3822e-01, -2.8007e-01, -3.3823e-01, -1.5775e-01],
[ 2.9323e-01, 1.6723e-01, -3.4153e-01, -1.1209e-01, 1.7355e-01, -6.1724e-01, -5.4201e-01, -4.9944e-01, -5.2212e-01, -2.7596e-01]
],
device=torch_device,
)
# fmt: on
expected_loss = torch.tensor(74.4595, device=torch_device)
self.assertTrue(torch.allclose(spectrogram[0, :10, :10], expected_mel_spectrogram, atol=1e-3))
self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
self.assertEqual(spectrogram.shape, (1, 224, model.config.num_mel_bins))
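# Illustration only (not part of the test suite): a minimal text -> spectrogram -> waveform
# sketch with the standalone vocoder, using the same checkpoints as the integration tests
# above; the vocoder call mirrors FastSpeech2ConformerHifiGan's forward(spectrogram).
import torch
from transformers import (
    FastSpeech2ConformerHifiGan,
    FastSpeech2ConformerModel,
    FastSpeech2ConformerTokenizer,
)
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan")
inputs = tokenizer("Test that this generates speech", return_tensors="pt")
with torch.no_grad():
    spectrogram = model(inputs["input_ids"]).spectrogram
    waveform = vocoder(spectrogram)  # (batch_size, num_samples) float waveform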
class FastSpeech2ConformerWithHifiGanTester:
def __init__(
self,
parent,
batch_size=13,
num_hidden_layers=1,
num_attention_heads=2,
hidden_size=24,
seq_length=7,
encoder_linear_units=384,
decoder_linear_units=384,
is_training=False,
speech_decoder_postnet_units=128,
speech_decoder_postnet_layers=2,
pitch_predictor_layers=1,
energy_predictor_layers=1,
duration_predictor_layers=1,
num_mel_bins=8,
upsample_initial_channel=64,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.vocab_size = hidden_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.encoder_linear_units = encoder_linear_units
self.decoder_linear_units = decoder_linear_units
self.speech_decoder_postnet_units = speech_decoder_postnet_units
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
self.pitch_predictor_layers = pitch_predictor_layers
self.energy_predictor_layers = energy_predictor_layers
self.duration_predictor_layers = duration_predictor_layers
self.num_mel_bins = num_mel_bins
self.upsample_initial_channel = upsample_initial_channel
def prepare_config_and_inputs(self):
config = self.get_config()
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
return config, input_ids
def get_config(self):
self.model_config = FastSpeech2ConformerConfig(
hidden_size=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_linear_units=self.encoder_linear_units,
decoder_linear_units=self.decoder_linear_units,
speech_decoder_postnet_units=self.speech_decoder_postnet_units,
speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
num_mel_bins=self.num_mel_bins,
pitch_predictor_layers=self.pitch_predictor_layers,
energy_predictor_layers=self.energy_predictor_layers,
duration_predictor_layers=self.duration_predictor_layers,
)
self.vocoder_config = FastSpeech2ConformerHifiGanConfig(
model_in_dim=self.num_mel_bins, upsample_initial_channel=self.upsample_initial_channel
)
return FastSpeech2ConformerWithHifiGanConfig(
model_config=self.model_config.to_dict(), vocoder_config=self.vocoder_config.to_dict()
)
def create_and_check_model(self, config, input_ids, *args):
model = FastSpeech2ConformerWithHifiGan(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, return_dict=True)
# total of 6 keys in result
self.parent.assertEqual(len(result), 6)
# check batch sizes match
for value in result.values():
self.parent.assertEqual(value.size(0), self.batch_size)
# check duration, pitch, and energy have the appropriate shapes
# duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1)
self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape)
self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape)
# check predicted mel-spectrogram has correct dimension
self.parent.assertEqual(result["spectrogram"].size(2), model.config.model_config.num_mel_bins)
def prepare_config_and_inputs_for_common(self):
config, input_ids = self.prepare_config_and_inputs()
inputs_dict = {"input_ids": input_ids}
return config, inputs_dict
@require_torch
class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (FastSpeech2ConformerWithHifiGan,) if is_torch_available() else ()
test_pruning = False
test_headmasking = False
test_torchscript = False
test_resize_embeddings = False
is_encoder_decoder = True
def setUp(self):
self.model_tester = FastSpeech2ConformerWithHifiGanTester(self)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
msg = f"Parameter {name} of model {model_class} seems not properly initialized"
if "norm" in name:
if "bias" in name:
self.assertEqual(param.data.mean().item(), 0.0, msg=msg)
if "weight" in name:
self.assertEqual(param.data.mean().item(), 1.0, msg=msg)
elif "conv" in name or "embed" in name:
self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
return inputs_dict
def test_duration_energy_pitch_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.model_config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
# duration
self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len])
# energy
self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
# pitch
self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
def test_hidden_states_output(self):
def _check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]):
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
self.assertIsInstance(hidden_states, (list, tuple))
expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape
self.assertEqual(expected_batch_size, self.model_tester.batch_size)
# Only test encoder seq_length since decoder seq_length is variable based on inputs
if idx == 0:
self.assertEqual(expected_seq_length, self.model_tester.seq_length)
self.assertEqual(expected_hidden_size, self.model_tester.hidden_size)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict["output_hidden_states"] = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.model_config.output_hidden_states = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan)
def test_save_load_strict(self):
config, _ = self.model_tester.prepare_config_and_inputs()
model = FastSpeech2ConformerWithHifiGan(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
_, info = FastSpeech2ConformerWithHifiGan.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], [])
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = FastSpeech2ConformerWithHifiGan(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = [
"input_ids",
"attention_mask",
"spectrogram_labels",
"duration_labels",
"pitch_labels",
"energy_labels",
"speaker_ids",
"lang_ids",
"speaker_embedding",
"return_dict",
"output_attentions",
"output_hidden_states",
]
self.assertListEqual(arg_names, expected_arg_names)
# Override as FastSpeech2Conformer does not output cross attentions
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.model_config.output_hidden_states = True
config.model_config.output_attentions = True
model = FastSpeech2ConformerWithHifiGan(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerWithHifiGan)
outputs = model(**inputs)
output = outputs[0]
encoder_hidden_states = outputs.encoder_hidden_states[0]
encoder_hidden_states.retain_grad()
decoder_hidden_states = outputs.decoder_hidden_states[0]
decoder_hidden_states.retain_grad()
encoder_attentions = outputs.encoder_attentions[0]
encoder_attentions.retain_grad()
decoder_attentions = outputs.decoder_attentions[0]
decoder_attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(decoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
self.assertIsNotNone(decoder_attentions.grad)
def test_attention_outputs(self):
"""
Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable
decoder attention shape, and uniquely outputs energy, pitch, and durations.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.model_config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.model_config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.model_config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
encoder_attentions = outputs.encoder_attentions
self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(encoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
out_len = len(outputs)
correct_outlen = 8
self.assertEqual(out_len, correct_outlen)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
added_hidden_states = 2
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.encoder_attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
@slow
def test_model_from_pretrained(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
self.assertIsNotNone(model)
@unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="FastSpeech2Conformer has no input embeddings")
def test_model_common_attributes(self):
pass
@require_torch
@require_g2p_en
@slow
class FastSpeech2ConformerWithHifiGanIntegrationTest(unittest.TestCase):
def test_inference_integration(self):
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
model.to(torch_device)
model.eval()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
text = "Test that this generates speech"
input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"]
output = model(input_ids)
waveform = output.waveform
# waveform is too large (1, 52480), so only check first 100 elements
# fmt: off
expected_waveform = torch.tensor(
[
[-9.6345e-04, 1.3557e-03, 5.7559e-04, 2.4706e-04, 2.2675e-04, 1.2258e-04, 4.7784e-04, 1.0109e-03, -1.9718e-04, 6.3495e-04, 3.2106e-04, 6.3620e-05, 9.1713e-04, -2.5664e-05, 1.9596e-04, 6.0418e-04, 8.1112e-04, 3.6342e-04, -6.3396e-04, -2.0146e-04, -1.1768e-04, 4.3155e-04, 7.5599e-04, -2.2972e-04, -9.5665e-05, 3.3078e-04, 1.3793e-04, -1.4932e-04, -3.9645e-04, 3.6473e-05, -1.7224e-04, -4.5370e-05, -4.8950e-04, -4.3059e-04, 1.0451e-04, -1.0485e-03, -6.0410e-04, 1.6990e-04, -2.1997e-04, -3.8769e-04, -7.6898e-04, -3.2372e-04, -1.9783e-04, 5.2896e-05, -1.0586e-03, -7.8516e-04, 7.6867e-04, -8.5331e-05, -4.8158e-04, -4.5362e-05, -1.0770e-04, 6.6823e-04, 3.0765e-04, 3.3669e-04, 9.5677e-04, 1.0458e-03, 5.8129e-04, 3.3737e-04, 1.0816e-03, 7.0346e-04, 4.2378e-04, 4.3131e-04, 2.8095e-04, 1.2201e-03, 5.6121e-04, -1.1086e-04, 4.9908e-04, 1.5586e-04, 4.2046e-04, -2.8088e-04, -2.2462e-04, -1.5539e-04, -7.0126e-04, -2.8577e-04, -3.3693e-04, -1.2471e-04, -6.9104e-04, -1.2867e-03, -6.2651e-04, -2.5586e-04, -1.3201e-04, -9.4537e-04, -4.8438e-04, 4.1458e-04, 6.4109e-04, 1.0891e-04, -6.3764e-04, 4.5573e-04, 8.2974e-04, 3.2973e-06, -3.8274e-04, -2.0400e-04, 4.9922e-04, 2.1508e-04, -1.1009e-04, -3.9763e-05, 3.0576e-04, 3.1485e-05, -2.7574e-05, 3.3856e-04],
],
device=torch_device,
)
# fmt: on
self.assertTrue(torch.allclose(waveform[0, :100], expected_waveform, atol=1e-4))
self.assertEqual(waveform.shape, (1, 52480))
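# Illustration only (not part of the test suite): the combined model collapses the same
# text -> waveform flow into a single forward call, as exercised by the test above.
import torch
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
inputs = tokenizer("Test that this generates speech", return_tensors="pt")
with torch.no_grad():
    waveform = model(inputs["input_ids"]).waveform  # (1, 52480) for this sentence, per the test above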
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the FastSpeech2Conformer tokenizer."""
import unittest
from transformers.models.fastspeech2_conformer import FastSpeech2ConformerTokenizer
from transformers.testing_utils import require_g2p_en, slow
from ...test_tokenization_common import TokenizerTesterMixin
@require_g2p_en
class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FastSpeech2ConformerTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
tokenizer.save_pretrained(self.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"
output_text = "this is a test"
return input_text, output_text
# Custom `get_clean_sequence` since FastSpeech2ConformerTokenizer can't decode id -> string
def get_clean_sequence(self, tokenizer, with_prefix_space=False, **kwargs): # max_length=20, min_length=5
input_text, output_text = self.get_input_output_texts(tokenizer)
ids = tokenizer.encode(output_text, add_special_tokens=False)
return output_text, ids
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
token = "<unk>"
token_id = 1
self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
self.assertEqual(vocab_keys[0], "<blank>")
self.assertEqual(vocab_keys[1], "<unk>")
self.assertEqual(vocab_keys[-4], "UH0")
self.assertEqual(vocab_keys[-2], "..")
self.assertEqual(vocab_keys[-1], "<sos/eos>")
self.assertEqual(len(vocab_keys), 78)
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 78)
@unittest.skip(
"FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_added_token_are_matched_longest_first(self):
pass
@unittest.skip(
"FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_added_tokens_do_lower_case(self):
pass
@unittest.skip(
"FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_tokenize_special_tokens(self):
pass
def test_full_tokenizer(self):
tokenizer = self.get_tokenizer()
tokens = tokenizer.tokenize("This is a test")
ids = [9, 12, 6, 12, 11, 2, 4, 15, 6, 4, 77]
self.assertListEqual(tokens, ["DH", "IH1", "S", "IH1", "Z", "AH0", "T", "EH1", "S", "T", "<sos/eos>"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), ids)
self.assertListEqual(tokenizer.convert_ids_to_tokens(ids), tokens)
@slow
def test_tokenizer_integration(self):
# Custom test since:
# 1) This tokenizer only decodes to tokens (phonemes cannot be converted to text with complete accuracy)
# 2) Uses a sequence without numbers since espnet has different, custom number conversion.
# This tokenizer can phonemize numbers, but where in espnet "32" is phonemized as "thirty two",
# here "32" is phonemized as "thirty-two" because we haven't implemented the custom number handling.
sequences = [
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, GPT, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over thirty-two pretrained "
"models in one hundred plus languages and deep interoperability between Jax, PyTorch and TensorFlow.",
"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
"conditioning on both left and right context in all layers.",
"The quick brown fox jumps over the lazy dog.",
]
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained(
"espnet/fastspeech2_conformer", revision="07f9c4a2d6bbc69b277d87d2202ad1e35b05e113"
)
actual_encoding = tokenizer(sequences)
# fmt: off
expected_encoding = {
'input_ids': [
[4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 22, 30, 7, 14, 21, 8, 29, 3, 34, 3, 18, 11, 17, 12, 4, 21, 10, 4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 2, 3, 5, 17, 12, 4, 21, 10, 17, 7, 29, 4, 7, 31, 3, 5, 25, 38, 4, 17, 7, 2, 20, 32, 5, 11, 40, 15, 3, 21, 2, 8, 17, 38, 17, 2, 6, 24, 7, 10, 2, 4, 45, 10, 39, 21, 11, 25, 38, 4, 23, 37, 15, 4, 6, 23, 7, 2, 25, 38, 4, 2, 23, 11, 8, 15, 14, 11, 23, 5, 13, 6, 4, 12, 8, 4, 21, 25, 23, 11, 8, 15, 3, 39, 2, 8, 1, 22, 30, 7, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 62, 3, 5, 21, 6, 4, 18, 3, 5, 13, 36, 3, 8, 28, 2, 3, 5, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 40, 45, 3, 21, 31, 35, 2, 3, 15, 8, 36, 16, 12, 9, 34, 20, 21, 43, 38, 5, 29, 4, 28, 17, 7, 29, 4, 7, 31, 3, 5, 14, 24, 5, 2, 8, 11, 13, 3, 16, 19, 3, 26, 19, 3, 5, 7, 2, 5, 17, 8, 19, 6, 8, 18, 36, 37, 16, 2, 40, 2, 11, 2, 3, 5, 5, 27, 17, 49, 3, 4, 21, 2, 17, 21, 25, 12, 8, 2, 4, 29, 25, 13, 4, 16, 27, 3, 40, 18, 10, 6, 23, 17, 12, 4, 21, 10, 2, 3, 5, 4, 15, 3, 6, 21, 8, 46, 22, 33, 77],
[25, 38, 4, 12, 11, 5, 13, 11, 32, 3, 5, 4, 28, 17, 7, 27, 4, 7, 31, 3, 5, 27, 17, 25, 51, 5, 13, 7, 15, 10, 35, 2, 3, 2, 8, 7, 45, 17, 7, 2, 11, 2, 3, 4, 31, 35, 2, 3, 11, 22, 7, 19, 14, 2, 3, 8, 31, 25, 2, 8, 5, 4, 15, 10, 6, 4, 25, 32, 40, 55, 3, 4, 8, 29, 10, 2, 3, 5, 12, 35, 2, 3, 13, 36, 24, 3, 25, 34, 43, 8, 15, 22, 4, 2, 3, 5, 7, 32, 4, 10, 24, 3, 4, 54, 10, 6, 4, 13, 3, 30, 8, 8, 31, 21, 11, 33, 77],
[9, 2, 10, 16, 12, 10, 25, 7, 42, 3, 22, 24, 10, 6, 40, 19, 14, 17, 6, 34, 20, 21, 9, 2, 8, 31, 11, 29, 5, 30, 37, 33, 77]
],
'attention_mask': [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
}
# fmt: on
actual_tokens = [tokenizer.decode(input_ids) for input_ids in expected_encoding["input_ids"]]
expected_tokens = [
[tokenizer.convert_ids_to_tokens(id) for id in sequence] for sequence in expected_encoding["input_ids"]
]
self.assertListEqual(actual_encoding["input_ids"], expected_encoding["input_ids"])
self.assertListEqual(actual_encoding["attention_mask"], expected_encoding["attention_mask"])
self.assertTrue(actual_tokens == expected_tokens)
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_add_tokens_tokenizer(self):
pass
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_add_special_tokens(self):
pass
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_added_token_serializable(self):
pass
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_save_and_load_tokenizer(self):
pass
@unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping")
def test_internal_consistency(self):
pass
@unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping")
def test_encode_decode_with_spaces(self):
pass
@unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping")
def test_convert_tokens_to_string_format(self):
pass
@unittest.skip("FastSpeech2Conformer tokenizer does not support pairs.")
def test_maximum_encoding_length_pair_input(self):
pass
@unittest.skip(
"FastSpeech2Conformer tokenizer appends eos_token to each string it's passed, including `is_split_into_words=True`."
)
def test_pretokenized_inputs(self):
pass
@unittest.skip(
reason="g2p_en is slow is with large inputs and max encoding length is not a concern for FastSpeech2Conformer"
)
def test_maximum_encoding_length_single_input(self):
pass
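# Illustration only (not part of the test suite): the tokenizer phonemizes English text with
# g2p_en and, as the tests above note, decoding returns phoneme tokens rather than the
# original text, because the phoneme-to-text mapping is one-to-many.
from transformers import FastSpeech2ConformerTokenizer
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
ids = tokenizer("This is a test")["input_ids"]
print(tokenizer.decode(ids))  # phoneme tokens such as ['DH', 'IH1', 'S', ..., '<sos/eos>']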
@@ -123,6 +123,7 @@ SPECIAL_CASES_TO_ALLOW.update(
"DinatConfig": True,
"DonutSwinConfig": True,
"EfficientFormerConfig": True,
"FastSpeech2ConformerConfig": True,
"FSMTConfig": True, "FSMTConfig": True,
"JukeboxConfig": True, "JukeboxConfig": True,
"LayoutLMv2Config": True, "LayoutLMv2Config": True,
@@ -90,6 +90,8 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"UMT5EncoderModel", # Building part of bigger (tested) model.
"Blip2QFormerModel", # Building part of bigger (tested) model.
"ErnieMForInformationExtraction",
"FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from)
"FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models.
"GraphormerDecoderHead", # Building part of bigger (tested) model. "GraphormerDecoderHead", # Building part of bigger (tested) model.
"JukeboxVQVAE", # Building part of bigger (tested) model. "JukeboxVQVAE", # Building part of bigger (tested) model.
"JukeboxPrior", # Building part of bigger (tested) model. "JukeboxPrior", # Building part of bigger (tested) model.
...@@ -159,6 +161,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ ...@@ -159,6 +161,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"Blip2QFormerModel", "Blip2QFormerModel",
"Blip2VisionModel", "Blip2VisionModel",
"ErnieMForInformationExtraction", "ErnieMForInformationExtraction",
"FastSpeech2ConformerHifiGan",
"FastSpeech2ConformerWithHifiGan",
"GitVisionModel", "GitVisionModel",
"GraphormerModel", "GraphormerModel",
"GraphormerForGraphClassification", "GraphormerForGraphClassification",