Unverified Commit d83ff5ee authored by Connor Henderson, committed by GitHub

Add FastSpeech2Conformer (#23439)

* start - docs, SpeechT5 copy and rename

* add relevant code from FastSpeech2 draft, have tests pass

* make it an actual conformer, demo ex.

* matching inference with original repo, includes debug code

* refactor nn.Sequentials, start more desc. var names

* more renaming

* more renaming

* vocoder scratchwork

* matching vocoder outputs

* hifigan vocoder conversion script

* convert model script, rename some config vars

* replace postnet with speecht5's implementation

* passing common tests, file cleanup

* expand testing, add output hidden states and attention

* tokenizer + passing tokenizer tests

* variety of updates and tests

* g2p_en pckg setup

* import structure edits

* docstrings and cleanup

* repo consistency

* deps

* small cleanup

* forward signature param order

* address comments except for masks and labels

* address comments on attention_mask and labels

* address second round of comments

* remove old unneeded line

* address comments part 1

* address comments pt 2

* rename auto mapping

* fixes for failing tests

* address comments part 3 (bart-like, train loss)

* make style

* pass config where possible

* add forward method + tests to WithHifiGan model

* make style

* address arg passing and generate_speech comments

* address Arthur comments

* address Arthur comments pt2

* lint  changes

* Sanchit comment

* add g2p-en to doctest deps

* move up self.encoder

* onnx compatible tensor method

* fix is symbolic

* fix paper url

* move models to espnet org

* make style

* make fix-copies

* update docstring

* Arthur comments

* update docstring w/ new updates

* add model architecture images

* header size

* md wording update

* make style
parent 6eba901d
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" FastSpeech2Conformer model configuration"""
from typing import Dict
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_hifigan/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_with_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_with_hifigan/raw/main/config.json",
}
class FastSpeech2ConformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to
instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 384):
The dimensionality of the hidden layers.
vocab_size (`int`, *optional*, defaults to 78):
The size of the vocabulary.
num_mel_bins (`int`, *optional*, defaults to 80):
The number of mel filters used in the filter bank.
encoder_num_attention_heads (`int`, *optional*, defaults to 2):
The number of attention heads in the encoder.
encoder_layers (`int`, *optional*, defaults to 4):
The number of layers in the encoder.
encoder_linear_units (`int`, *optional*, defaults to 1536):
The number of units in the linear layer of the encoder.
decoder_layers (`int`, *optional*, defaults to 4):
The number of layers in the decoder.
decoder_num_attention_heads (`int`, *optional*, defaults to 2):
The number of attention heads in the decoder.
decoder_linear_units (`int`, *optional*, defaults to 1536):
The number of units in the linear layer of the decoder.
speech_decoder_postnet_layers (`int`, *optional*, defaults to 5):
The number of layers in the post-net of the speech decoder.
speech_decoder_postnet_units (`int`, *optional*, defaults to 256):
The number of units in the post-net layers of the speech decoder.
speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5):
The kernel size in the post-net of the speech decoder.
positionwise_conv_kernel_size (`int`, *optional*, defaults to 3):
The size of the convolution kernel used in the position-wise layer.
encoder_normalize_before (`bool`, *optional*, defaults to `False`):
Specifies whether to normalize before encoder layers.
decoder_normalize_before (`bool`, *optional*, defaults to `False`):
Specifies whether to normalize before decoder layers.
encoder_concat_after (`bool`, *optional*, defaults to `False`):
Specifies whether to concatenate after encoder layers.
decoder_concat_after (`bool`, *optional*, defaults to `False`):
Specifies whether to concatenate after decoder layers.
reduction_factor (`int`, *optional*, defaults to 1):
The factor by which the speech frame rate is reduced.
speaking_speed (`float`, *optional*, defaults to 1.0):
The speed of the speech produced.
use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`):
Specifies whether to use macaron style in the conformer.
use_cnn_in_conformer (`bool`, *optional*, defaults to `True`):
Specifies whether to use convolutional neural networks in the conformer.
encoder_kernel_size (`int`, *optional*, defaults to 7):
The kernel size used in the encoder.
decoder_kernel_size (`int`, *optional*, defaults to 31):
The kernel size used in the decoder.
duration_predictor_layers (`int`, *optional*, defaults to 2):
The number of layers in the duration predictor.
duration_predictor_channels (`int`, *optional*, defaults to 256):
The number of channels in the duration predictor.
duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
The kernel size used in the duration predictor.
energy_predictor_layers (`int`, *optional*, defaults to 2):
The number of layers in the energy predictor.
energy_predictor_channels (`int`, *optional*, defaults to 256):
The number of channels in the energy predictor.
energy_predictor_kernel_size (`int`, *optional*, defaults to 3):
The kernel size used in the energy predictor.
energy_predictor_dropout (`float`, *optional*, defaults to 0.5):
The dropout rate in the energy predictor.
energy_embed_kernel_size (`int`, *optional*, defaults to 1):
The kernel size used in the energy embed layer.
energy_embed_dropout (`float`, *optional*, defaults to 0.0):
The dropout rate in the energy embed layer.
stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`):
Specifies whether to stop gradients from the energy predictor.
pitch_predictor_layers (`int`, *optional*, defaults to 5):
The number of layers in the pitch predictor.
pitch_predictor_channels (`int`, *optional*, defaults to 256):
The number of channels in the pitch predictor.
pitch_predictor_kernel_size (`int`, *optional*, defaults to 5):
The kernel size used in the pitch predictor.
pitch_predictor_dropout (`float`, *optional*, defaults to 0.5):
The dropout rate in the pitch predictor.
pitch_embed_kernel_size (`int`, *optional*, defaults to 1):
The kernel size used in the pitch embed layer.
pitch_embed_dropout (`float`, *optional*, defaults to 0.0):
The dropout rate in the pitch embed layer.
stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`):
Specifies whether to stop gradients from the pitch predictor.
encoder_dropout_rate (`float`, *optional*, defaults to 0.2):
The dropout rate in the encoder.
encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
The positional dropout rate in the encoder.
encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
The attention dropout rate in the encoder.
decoder_dropout_rate (`float`, *optional*, defaults to 0.2):
The dropout rate in the decoder.
decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
The positional dropout rate in the decoder.
decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
The attention dropout rate in the decoder.
duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2):
The dropout rate in the duration predictor.
speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5):
The dropout rate in the speech decoder postnet.
max_source_positions (`int`, *optional*, defaults to 5000):
if `"relative"` position embeddings are used, defines the maximum source input positions.
use_masking (`bool`, *optional*, defaults to `True`):
Specifies whether to use masking in the model.
use_weighted_masking (`bool`, *optional*, defaults to `False`):
Specifies whether to use weighted masking in the model.
num_speakers (`int`, *optional*):
Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use
the speaker id embedding layer.
num_languages (`int`, *optional*):
Number of languages. If set to > 1, assume that the language ids will be provided as the input and use
the language id embedding layer.
speaker_embed_dim (`int`, *optional*):
Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Specifies whether the model is an encoder-decoder.
Example:
```python
>>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig
>>> # Initializing a FastSpeech2Conformer style configuration
>>> configuration = FastSpeech2ConformerConfig()
>>> # Initializing a model from the FastSpeech2Conformer style configuration
>>> model = FastSpeech2ConformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "fastspeech2_conformer"
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
def __init__(
self,
hidden_size=384,
vocab_size=78,
num_mel_bins=80,
encoder_num_attention_heads=2,
encoder_layers=4,
encoder_linear_units=1536,
decoder_layers=4,
decoder_num_attention_heads=2,
decoder_linear_units=1536,
speech_decoder_postnet_layers=5,
speech_decoder_postnet_units=256,
speech_decoder_postnet_kernel=5,
positionwise_conv_kernel_size=3,
encoder_normalize_before=False,
decoder_normalize_before=False,
encoder_concat_after=False,
decoder_concat_after=False,
reduction_factor=1,
speaking_speed=1.0,
use_macaron_style_in_conformer=True,
use_cnn_in_conformer=True,
encoder_kernel_size=7,
decoder_kernel_size=31,
duration_predictor_layers=2,
duration_predictor_channels=256,
duration_predictor_kernel_size=3,
energy_predictor_layers=2,
energy_predictor_channels=256,
energy_predictor_kernel_size=3,
energy_predictor_dropout=0.5,
energy_embed_kernel_size=1,
energy_embed_dropout=0.0,
stop_gradient_from_energy_predictor=False,
pitch_predictor_layers=5,
pitch_predictor_channels=256,
pitch_predictor_kernel_size=5,
pitch_predictor_dropout=0.5,
pitch_embed_kernel_size=1,
pitch_embed_dropout=0.0,
stop_gradient_from_pitch_predictor=True,
encoder_dropout_rate=0.2,
encoder_positional_dropout_rate=0.2,
encoder_attention_dropout_rate=0.2,
decoder_dropout_rate=0.2,
decoder_positional_dropout_rate=0.2,
decoder_attention_dropout_rate=0.2,
duration_predictor_dropout_rate=0.2,
speech_decoder_postnet_dropout=0.5,
max_source_positions=5000,
use_masking=True,
use_weighted_masking=False,
num_speakers=None,
num_languages=None,
speaker_embed_dim=None,
is_encoder_decoder=True,
**kwargs,
):
if positionwise_conv_kernel_size % 2 == 0:
raise ValueError(
f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead."
)
if encoder_kernel_size % 2 == 0:
raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.")
if decoder_kernel_size % 2 == 0:
raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.")
if duration_predictor_kernel_size % 2 == 0:
raise ValueError(
f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead."
)
if energy_predictor_kernel_size % 2 == 0:
raise ValueError(
f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead."
)
if energy_embed_kernel_size % 2 == 0:
raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.")
if pitch_predictor_kernel_size % 2 == 0:
raise ValueError(
f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead."
)
if pitch_embed_kernel_size % 2 == 0:
raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.")
if hidden_size % encoder_num_attention_heads != 0:
raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.")
if hidden_size % decoder_num_attention_heads != 0:
raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.")
if use_masking and use_weighted_masking:
raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.num_mel_bins = num_mel_bins
self.encoder_config = {
"num_attention_heads": encoder_num_attention_heads,
"layers": encoder_layers,
"kernel_size": encoder_kernel_size,
"attention_dropout_rate": encoder_attention_dropout_rate,
"dropout_rate": encoder_dropout_rate,
"positional_dropout_rate": encoder_positional_dropout_rate,
"linear_units": encoder_linear_units,
"normalize_before": encoder_normalize_before,
"concat_after": encoder_concat_after,
}
self.decoder_config = {
"num_attention_heads": decoder_num_attention_heads,
"layers": decoder_layers,
"kernel_size": decoder_kernel_size,
"attention_dropout_rate": decoder_attention_dropout_rate,
"dropout_rate": decoder_dropout_rate,
"positional_dropout_rate": decoder_positional_dropout_rate,
"linear_units": decoder_linear_units,
"normalize_before": decoder_normalize_before,
"concat_after": decoder_concat_after,
}
self.encoder_num_attention_heads = encoder_num_attention_heads
self.encoder_layers = encoder_layers
self.duration_predictor_channels = duration_predictor_channels
self.duration_predictor_kernel_size = duration_predictor_kernel_size
self.duration_predictor_layers = duration_predictor_layers
self.energy_embed_dropout = energy_embed_dropout
self.energy_embed_kernel_size = energy_embed_kernel_size
self.energy_predictor_channels = energy_predictor_channels
self.energy_predictor_dropout = energy_predictor_dropout
self.energy_predictor_kernel_size = energy_predictor_kernel_size
self.energy_predictor_layers = energy_predictor_layers
self.pitch_embed_dropout = pitch_embed_dropout
self.pitch_embed_kernel_size = pitch_embed_kernel_size
self.pitch_predictor_channels = pitch_predictor_channels
self.pitch_predictor_dropout = pitch_predictor_dropout
self.pitch_predictor_kernel_size = pitch_predictor_kernel_size
self.pitch_predictor_layers = pitch_predictor_layers
self.positionwise_conv_kernel_size = positionwise_conv_kernel_size
self.speech_decoder_postnet_units = speech_decoder_postnet_units
self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout
self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
self.reduction_factor = reduction_factor
self.speaking_speed = speaking_speed
self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
self.max_source_positions = max_source_positions
self.use_cnn_in_conformer = use_cnn_in_conformer
self.use_macaron_style_in_conformer = use_macaron_style_in_conformer
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.num_speakers = num_speakers
self.num_languages = num_languages
self.speaker_embed_dim = speaker_embed_dim
self.duration_predictor_dropout_rate = duration_predictor_dropout_rate
self.is_encoder_decoder = is_encoder_decoder
super().__init__(
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
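# Illustrative example: every convolution kernel size above must be odd so that the symmetric
# padding `(kernel_size - 1) // 2` used by the model preserves the sequence length. An even
# value is rejected at construction time, e.g.:
#
#   >>> FastSpeech2ConformerConfig(encoder_kernel_size=8)
#   Traceback (most recent call last):
#     ...
#   ValueError: encoder_kernel_size must be odd, but got 8 instead.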
class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGan`] model. It is used to
instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_in_dim (`int`, *optional*, defaults to 80):
The number of frequency bins in the input log-mel spectrogram.
upsample_initial_channel (`int`, *optional*, defaults to 512):
The number of input channels into the upsampling network.
upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
length of *upsample_rates* defines the number of convolutional layers and has to match the length of
*upsample_kernel_sizes*.
upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
*upsample_rates*.
resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
fusion (MRF) module.
resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
multi-receptive field fusion (MRF) module.
initializer_range (`float`, *optional*, defaults to 0.01):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
The angle of the negative slope used by the leaky ReLU activation.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
Example:
```python
>>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig
>>> # Initializing a FastSpeech2ConformerHifiGan configuration
>>> configuration = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = FastSpeech2ConformerHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "hifigan"
def __init__(
self,
model_in_dim=80,
upsample_initial_channel=512,
upsample_rates=[8, 8, 2, 2],
upsample_kernel_sizes=[16, 16, 4, 4],
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
initializer_range=0.01,
leaky_relu_slope=0.1,
normalize_before=True,
**kwargs,
):
self.model_in_dim = model_in_dim
self.upsample_initial_channel = upsample_initial_channel
self.upsample_rates = upsample_rates
self.upsample_kernel_sizes = upsample_kernel_sizes
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.initializer_range = initializer_range
self.leaky_relu_slope = leaky_relu_slope
self.normalize_before = normalize_before
super().__init__(**kwargs)
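# Illustrative note: the vocoder's total upsampling factor is the product of `upsample_rates`.
# With the defaults [8, 8, 2, 2], each input mel frame is expanded into 8 * 8 * 2 * 2 = 256
# waveform samples, which corresponds to a hop length of 256 audio samples per mel frame.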
class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to
instantiate a [`FastSpeech2ConformerWithHifiGan`] model according to the specified sub-model configurations,
defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and
FastSpeech2ConformerHifiGan
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_config (`typing.Dict`, *optional*):
Configuration dictionary used to instantiate the text-to-speech [`FastSpeech2ConformerConfig`] sub-model.
vocoder_config (`typing.Dict`, *optional*):
Configuration dictionary used to instantiate the [`FastSpeech2ConformerHifiGanConfig`] vocoder sub-model.
Example:
```python
>>> from transformers import (
... FastSpeech2ConformerConfig,
... FastSpeech2ConformerHifiGanConfig,
... FastSpeech2ConformerWithHifiGanConfig,
... FastSpeech2ConformerWithHifiGan,
... )
>>> # Initializing the FastSpeech2ConformerWithHifiGan sub-module configurations.
>>> model_config = FastSpeech2ConformerConfig()
>>> vocoder_config = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration
>>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict())
>>> # Initializing a model (with random weights)
>>> model = FastSpeech2ConformerWithHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fastspeech2_conformer_with_hifigan"
is_composition = True
def __init__(
self,
model_config: Dict = None,
vocoder_config: Dict = None,
**kwargs,
):
if model_config is None:
model_config = {}
logger.info("model_config is None. initializing the model with default values.")
if vocoder_config is None:
vocoder_config = {}
logger.info("vocoder_config is None. initializing the coarse model with default values.")
self.model_config = FastSpeech2ConformerConfig(**model_config)
self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config)
super().__init__(**kwargs)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FastSpeech2Conformer checkpoint."""
import argparse
import json
import re
from pathlib import Path
from tempfile import TemporaryDirectory
import torch
import yaml
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerTokenizer,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
CONFIG_MAPPING = {
"adim": "hidden_size",
"aheads": "num_attention_heads",
"conformer_dec_kernel_size": "decoder_kernel_size",
"conformer_enc_kernel_size": "encoder_kernel_size",
"decoder_normalize_before": "decoder_normalize_before",
"dlayers": "decoder_layers",
"dunits": "decoder_linear_units",
"duration_predictor_chans": "duration_predictor_channels",
"duration_predictor_kernel_size": "duration_predictor_kernel_size",
"duration_predictor_layers": "duration_predictor_layers",
"elayers": "encoder_layers",
"encoder_normalize_before": "encoder_normalize_before",
"energy_embed_dropout": "energy_embed_dropout",
"energy_embed_kernel_size": "energy_embed_kernel_size",
"energy_predictor_chans": "energy_predictor_channels",
"energy_predictor_dropout": "energy_predictor_dropout",
"energy_predictor_kernel_size": "energy_predictor_kernel_size",
"energy_predictor_layers": "energy_predictor_layers",
"eunits": "encoder_linear_units",
"pitch_embed_dropout": "pitch_embed_dropout",
"pitch_embed_kernel_size": "pitch_embed_kernel_size",
"pitch_predictor_chans": "pitch_predictor_channels",
"pitch_predictor_dropout": "pitch_predictor_dropout",
"pitch_predictor_kernel_size": "pitch_predictor_kernel_size",
"pitch_predictor_layers": "pitch_predictor_layers",
"positionwise_conv_kernel_size": "positionwise_conv_kernel_size",
"postnet_chans": "speech_decoder_postnet_units",
"postnet_filts": "speech_decoder_postnet_kernel",
"postnet_layers": "speech_decoder_postnet_layers",
"reduction_factor": "reduction_factor",
"stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor",
"stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor",
"transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate",
"transformer_dec_dropout_rate": "decoder_dropout_rate",
"transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate",
"transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate",
"transformer_enc_dropout_rate": "encoder_dropout_rate",
"transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate",
"use_cnn_in_conformer": "use_cnn_in_conformer",
"use_macaron_style_in_conformer": "use_macaron_style_in_conformer",
"use_masking": "use_masking",
"use_weighted_masking": "use_weighted_masking",
"idim": "input_dim",
"odim": "num_mel_bins",
"spk_embed_dim": "speaker_embed_dim",
"langs": "num_languages",
"spks": "num_speakers",
}
def remap_model_yaml_config(yaml_config_path):
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
remapped_config = {}
model_params = args.tts_conf["text2mel_params"]
# espnet_config_key -> hf_config_key, any keys not included are ignored
for espnet_config_key, hf_config_key in CONFIG_MAPPING.items():
if espnet_config_key in model_params:
remapped_config[hf_config_key] = model_params[espnet_config_key]
return remapped_config, args.g2p, args.token_list
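# Illustrative example of the remapping above, using a small hand-written subset of ESPnet keys
# (values are placeholders); keys that are absent from CONFIG_MAPPING are simply dropped:
#
#   >>> espnet_params = {"adim": 384, "aheads": 2, "elayers": 4, "some_unmapped_key": 0}
#   >>> {hf_key: espnet_params[k] for k, hf_key in CONFIG_MAPPING.items() if k in espnet_params}
#   {'hidden_size': 384, 'num_attention_heads': 2, 'encoder_layers': 4}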
def convert_espnet_state_dict_to_hf(state_dict):
new_state_dict = {}
for key in state_dict:
if "tts.generator.text2mel." in key:
new_key = key.replace("tts.generator.text2mel.", "")
if "postnet" in key:
new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers")
new_key = new_key.replace(".0.weight", ".conv.weight")
new_key = new_key.replace(".1.weight", ".batch_norm.weight")
new_key = new_key.replace(".1.bias", ".batch_norm.bias")
new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean")
new_key = new_key.replace(".1.running_var", ".batch_norm.running_var")
new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked")
if "feat_out" in key:
if "weight" in key:
new_key = "speech_decoder_postnet.feat_out.weight"
if "bias" in key:
new_key = "speech_decoder_postnet.feat_out.bias"
if "encoder.embed.0.weight" in key:
new_key = new_key.replace("0.", "")
if "w_1" in key:
new_key = new_key.replace("w_1", "conv1")
if "w_2" in key:
new_key = new_key.replace("w_2", "conv2")
if "predictor.conv" in key:
new_key = new_key.replace(".conv", ".conv_layers")
pattern = r"(\d)\.(\d)"
replacement = (
r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm"
)
new_key = re.sub(pattern, replacement, new_key)
if "pitch_embed" in key or "energy_embed" in key:
new_key = new_key.replace("0", "conv")
if "encoders" in key:
new_key = new_key.replace("encoders", "conformer_layers")
new_key = new_key.replace("norm_final", "final_layer_norm")
new_key = new_key.replace("norm_mha", "self_attn_layer_norm")
new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm")
new_key = new_key.replace("norm_ff", "ff_layer_norm")
new_key = new_key.replace("norm_conv", "conv_layer_norm")
if "lid_emb" in key:
new_key = new_key.replace("lid_emb", "language_id_embedding")
if "sid_emb" in key:
new_key = new_key.replace("sid_emb", "speaker_id_embedding")
new_state_dict[new_key] = state_dict[key]
return new_state_dict
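# Illustrative example of a single key rename performed by the function above (the tensors are
# copied over unchanged, only the key names are rewritten):
#
#   "tts.generator.text2mel.postnet.postnet.0.0.weight"
#       -> "speech_decoder_postnet.layers.0.conv.weight"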
@torch.no_grad()
def convert_FastSpeech2ConformerModel_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path)
config = FastSpeech2ConformerConfig(**model_params)
# Prepare the model
model = FastSpeech2ConformerModel(config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
model.save_pretrained(pytorch_dump_folder_path)
# Prepare the tokenizer
with TemporaryDirectory() as tempdir:
vocab = {token: id for id, token in enumerate(vocab)}
vocab_file = Path(tempdir) / "vocab.json"
with open(vocab_file, "w") as f:
json.dump(vocab, f)
should_strip_spaces = "no_space" in tokenizer_name
tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument(
"--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
)
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_FastSpeech2ConformerModel_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
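# Example invocation (all paths are placeholders):
#
#   python convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py \
#       --checkpoint_path /path/to/espnet_checkpoint.pth \
#       --yaml_config_path /path/to/espnet/config.yaml \
#       --pytorch_dump_folder_path ./fastspeech2_conformer_hf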
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FastSpeech2Conformer HiFi-GAN checkpoint."""
import argparse
from pathlib import Path
import torch
import yaml
from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def load_weights(checkpoint, hf_model, config):
vocoder_key_prefix = "tts.generator.vocoder."
checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k}
hf_model.apply_weight_norm()
hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
for i in range(len(config.upsample_rates)):
hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
for j in range(len(config.resblock_dilation_sizes)):
hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
hf_model.remove_weight_norm()
def remap_hifigan_yaml_config(yaml_config_path):
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
vocoder_type = args.tts_conf["vocoder_type"]
if vocoder_type != "hifigan_generator":
raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}")
remapped_dict = {}
vocoder_params = args.tts_conf["vocoder_params"]
# espnet_config_key -> hf_config_key
key_mappings = {
"channels": "upsample_initial_channel",
"in_channels": "model_in_dim",
"resblock_dilations": "resblock_dilation_sizes",
"resblock_kernel_sizes": "resblock_kernel_sizes",
"upsample_kernel_sizes": "upsample_kernel_sizes",
"upsample_scales": "upsample_rates",
}
for espnet_config_key, hf_config_key in key_mappings.items():
remapped_dict[hf_config_key] = vocoder_params[espnet_config_key]
remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"]
remapped_dict["normalize_before"] = False
remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"]
return remapped_dict
@torch.no_grad()
def convert_hifigan_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
yaml_config_path=None,
repo_id=None,
):
if yaml_config_path is not None:
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
else:
config = FastSpeech2ConformerHifiGanConfig()
model = FastSpeech2ConformerHifiGan(config)
orig_checkpoint = torch.load(checkpoint_path)
load_weights(orig_checkpoint, model, config)
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_hifigan_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.yaml_config_path,
args.push_to_hub,
)
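# Example invocation (paths are placeholders; --yaml_config_path may be omitted, in which case
# the default FastSpeech2ConformerHifiGanConfig is used):
#
#   python convert_hifigan.py \
#       --checkpoint_path /path/to/espnet_checkpoint.pth \
#       --pytorch_dump_folder_path ./fastspeech2_conformer_hifigan_hf \
#       --yaml_config_path /path/to/espnet/config.yaml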
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FastSpeech2Conformer checkpoint."""
import argparse
import torch
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGan,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerWithHifiGan,
FastSpeech2ConformerWithHifiGanConfig,
logging,
)
from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import (
convert_espnet_state_dict_to_hf,
remap_model_yaml_config,
)
from .convert_hifigan import load_weights, remap_hifigan_yaml_config
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def convert_FastSpeech2ConformerWithHifiGan_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
# Prepare the model
model_params, *_ = remap_model_yaml_config(yaml_config_path)
model_config = FastSpeech2ConformerConfig(**model_params)
model = FastSpeech2ConformerModel(model_config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
# Prepare the vocoder
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
vocoder = FastSpeech2ConformerHifiGan(vocoder_config)
load_weights(espnet_checkpoint, vocoder, vocoder_config)
# Prepare the model + vocoder
config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config)
with_hifigan_model = FastSpeech2ConformerWithHifiGan(config)
with_hifigan_model.model = model
with_hifigan_model.vocoder = vocoder
with_hifigan_model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
with_hifigan_model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument(
"--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert"
)
parser.add_argument(
"--pytorch_dump_folder_path",
required=True,
default=None,
type=str,
help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_FastSpeech2ConformerWithHifiGan_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
# coding=utf-8
# Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch FastSpeech2Conformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
from torch import nn
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, add_start_docstrings, logging, replace_return_docstrings
from .configuration_fastspeech2_conformer import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerWithHifiGanConfig,
)
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"espnet/fastspeech2_conformer",
# See all FastSpeech2Conformer models at https://huggingface.co/models?filter=fastspeech2_conformer
]
@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
"""
Output type of [`FastSpeech2ConformerModel`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Spectrogram generation loss.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
The predicted spectrogram.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
Outputs of the duration predictor.
pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the pitch predictor.
energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the energy predictor.
"""
loss: Optional[torch.FloatTensor] = None
spectrogram: torch.FloatTensor = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
duration_outputs: torch.LongTensor = None
pitch_outputs: torch.FloatTensor = None
energy_outputs: torch.FloatTensor = None
@dataclass
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
"""
Output type of [`FastSpeech2ConformerWithHifiGan`].
Args:
waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Speech output as a result of passing the predicted mel spectrogram through the vocoder.
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Spectrogram generation loss.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
The predicted spectrogram.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
Outputs of the duration predictor.
pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the pitch predictor.
energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the energy predictor.
"""
waveform: torch.FloatTensor = None
_CONFIG_FOR_DOC = "FastSpeech2ConformerConfig"
FASTSPEECH2_CONFORMER_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerHifiGanConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerWithHifiGanConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
"""
Length regulator for feed-forward Transformer.
This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to
frame-level by repeating each feature based on the corresponding predicted durations.
Args:
encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
Batch of sequences of char or phoneme embeddings.
duration_labels (`torch.LongTensor` of shape `(batch_size, time)`):
Batch of durations of each frame.
speaking_speed (`float`, *optional*, defaults to 1.0):
Value to control speed of speech.
Returns:
`torch.Tensor`:
Replicated input tensor based on durations (batch_size, time*, embedding_dim).
"""
if speaking_speed <= 0:
raise ValueError("`speaking_speed` must be greater than 0.")
elif speaking_speed != 1.0:
duration_labels = torch.round(duration_labels.float() * speaking_speed).long()
if duration_labels.sum() == 0:
duration_labels[duration_labels.sum(dim=1).eq(0)] = 1
# Calculate the maximum length needed
max_len = torch.sum(duration_labels, dim=1).max()
# Create a padded tensor to hold the results
hidden_states = torch.zeros(
(encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
dtype=torch.float,
device=encoded_embeddings.device,
)
# Loop through the batch and fill in the data
for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
hidden_states[i, : repeated.size(0)] = repeated
return hidden_states
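# Illustrative example: with durations [[2, 1]], the first embedding is repeated twice and the
# second once, so a (1, 2, hidden) input expands to a (1, 3, hidden) output (shorter sequences
# in a batch are zero-padded up to the batch maximum):
#
#   >>> embeddings = torch.ones(1, 2, 4)
#   >>> durations = torch.tensor([[2, 1]])
#   >>> length_regulator(embeddings, durations).shape
#   torch.Size([1, 3, 4])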
class FastSpeech2ConformerDurationPredictor(nn.Module):
"""
Duration predictor module.
This is the duration predictor module described in the paper 'FastSpeech: Fast, Robust and Controllable Text to
Speech' https://arxiv.org/pdf/1905.09263.pdf. The duration predictor predicts the duration of each frame in the log
domain from the hidden embeddings of the encoder.
Note:
The calculation domain of the outputs differs between training and inference. During training the outputs are
calculated in the log domain, while at inference time they are converted to the linear domain.
"""
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
self.conv_layers = nn.ModuleList()
self.log_domain_offset = 1.0
for layer_idx in range(config.duration_predictor_layers):
num_chans = config.duration_predictor_channels
input_channels = config.hidden_size if layer_idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(
input_channels,
num_chans,
config.duration_predictor_kernel_size,
config.duration_predictor_dropout_rate,
)
self.conv_layers.append(layer)
self.linear = nn.Linear(config.duration_predictor_channels, 1)
def forward(self, encoder_hidden_states):
"""
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences.
Returns:
`torch.Tensor`: Batch of predicted durations of shape `(batch_size, max_text_length)`, in the log domain during
training and in the linear domain at inference.
"""
# (batch_size, input_dim, max_text_length)
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
# NOTE: calculate in log domain, (batch_size, max_text_length)
hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1)
if not self.training:
# NOTE: calculate in linear domain
hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long()
return hidden_states
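# Illustrative example of the log/linear domain note above: a raw prediction of 1.10 is returned
# as-is in training mode (log domain), while in eval mode it becomes
# clamp(round(exp(1.10) - 1.0), min=0) = round(2.0042) = 2 frames (linear domain).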
# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer
class FastSpeech2ConformerBatchNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
if layer_id == 0:
in_conv_dim = config.num_mel_bins
else:
in_conv_dim = config.speech_decoder_postnet_units
if layer_id == config.speech_decoder_postnet_layers - 1:
out_conv_dim = config.num_mel_bins
else:
out_conv_dim = config.speech_decoder_postnet_units
self.conv = nn.Conv1d(
in_conv_dim,
out_conv_dim,
kernel_size=config.speech_decoder_postnet_kernel,
stride=1,
padding=(config.speech_decoder_postnet_kernel - 1) // 2,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(out_conv_dim)
if layer_id < config.speech_decoder_postnet_layers - 1:
self.activation = nn.Tanh()
else:
self.activation = None
self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
if self.activation is not None:
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
self.layers = nn.ModuleList(
[FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
)
def forward(self, hidden_states: torch.Tensor):
outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
layer_output = outputs_before_postnet.transpose(1, 2)
for layer in self.layers:
layer_output = layer(layer_output)
outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2)
return outputs_before_postnet, outputs_after_postnet
class FastSpeech2ConformerPredictorLayer(nn.Module):
def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
super().__init__()
self.conv = nn.Conv1d(
input_channels,
num_chans,
kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
)
self.activation = nn.ReLU()
self.layer_norm = nn.LayerNorm(num_chans)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
# Perform layer norm on dimension 1
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerVariancePredictor(nn.Module):
def __init__(
self,
config: FastSpeech2ConformerConfig,
num_layers=2,
num_chans=384,
kernel_size=3,
dropout_rate=0.5,
):
"""
Initialize the variance predictor module.
Args:
config ([`FastSpeech2ConformerConfig`]): Model configuration, used to set the input dimension.
num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers.
num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers.
kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers.
dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate.
"""
super().__init__()
self.conv_layers = nn.ModuleList()
for idx in range(num_layers):
input_channels = config.hidden_size if idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate)
self.conv_layers.append(layer)
self.linear = nn.Linear(num_chans, 1)
def forward(self, encoder_hidden_states, padding_masks=None):
"""
Calculate forward propagation.
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences.
padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
Batch of masks indicating padded part.
Returns:
Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
"""
# (batch_size, input_dim, max_text_length)
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
hidden_states = self.linear(hidden_states.transpose(1, 2))
if padding_masks is not None:
hidden_states = hidden_states.masked_fill(padding_masks, 0.0)
return hidden_states
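
# Editor's sketch of the padding handling above (toy sizes): predictions at padded text
# positions are zeroed with `masked_fill`, mirroring the `~attention_mask.bool()` mask that
# `FastSpeech2ConformerModel.forward` passes in (unsqueezed to add the trailing channel dim).
import torch

sketch_predictions = torch.randn(2, 5, 1)  # (batch_size, max_text_length, 1)
sketch_padding_masks = torch.tensor(
    [[False, False, False, True, True], [False, False, False, False, False]]
).unsqueeze(-1)
sketch_predictions = sketch_predictions.masked_fill(sketch_padding_masks, 0.0)
print(sketch_predictions[0, 3:].abs().sum())  # tensor(0.) -> padded positions contribute nothing
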
class FastSpeech2ConformerVarianceEmbedding(nn.Module):
def __init__(
self,
in_channels=1,
out_channels=384,
kernel_size=1,
padding=0,
dropout_rate=0.0,
):
super().__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class FastSpeech2ConformerAttention(nn.Module):
"""
Multi-Head attention layer with relative position encoding. Details can be found in
https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860.
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""Construct an FastSpeech2ConformerAttention object."""
super().__init__()
# We assume d_v always equals dim_key
self.num_heads = module_config["num_attention_heads"]
self.hidden_size = config.hidden_size
self.dim_key = self.hidden_size // self.num_heads
self.head_dim = self.hidden_size // self.num_heads
self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])
# linear transformation for positional encoding
self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        # these two learnable biases are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
def shift_relative_position_tensor(self, pos_tensor):
"""
Args:
pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.
"""
zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)
pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
# only keep the positions from 0 to time2
pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]
return pos_tensor
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
pos_emb: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time2, size)`): Values of the hidden states
attention_mask (`torch.Tensor` of shape `(batch, time1, time2)`): Mask tensor.
pos_emb (`torch.Tensor` of shape `(batch, 2*time1-1, size)`): Positional embedding tensor.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time1, d_model)`.
"""
bsz, q_len, _ = hidden_states.size()
query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
bsz_pos = pos_emb.size(0)
pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim)
# (batch_size, head, time1, dim_key)
query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2)
# (batch_size, head, time1, dim_key)
query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2)
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch_size, head, time1, time2)
matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1))
# compute matrix b and matrix d
# (batch_size, head, time1, 2*time1-1)
matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1))
matrix_bd = self.shift_relative_position_tensor(matrix_bd)
# (batch_size, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key)
# Forward attention
if attention_mask is not None:
expected_size = (bsz, 1, q_len)
if attention_mask.size() != expected_size:
raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}")
attention_mask = attention_mask.unsqueeze(1).eq(0)
min_value = float(torch.finfo(scores.dtype).min)
scores = scores.masked_fill(attention_mask, min_value)
attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0)
else:
attn_weights = torch.softmax(scores, dim=-1)
attn_weights = self.dropout(attn_weights)
attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2))
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
attn_output = self.linear_out(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
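
# Editor's sketch of the relative-shift trick used by `shift_relative_position_tensor` above
# (standalone re-implementation with a tiny tensor, for illustration only): scores over
# 2*time1 - 1 relative offsets are zero-padded, reshaped and sliced so that each query row i
# lines up with keys j = 0..time1-1, yielding one score per (query, key) pair.
import torch

sketch_b, sketch_h, sketch_t1 = 1, 1, 3
sketch_pos_scores = torch.arange(sketch_b * sketch_h * sketch_t1 * (2 * sketch_t1 - 1), dtype=torch.float32)
sketch_pos_scores = sketch_pos_scores.view(sketch_b, sketch_h, sketch_t1, 2 * sketch_t1 - 1)
sketch_zero_pad = torch.zeros((*sketch_pos_scores.size()[:3], 1))
sketch_padded = torch.cat([sketch_zero_pad, sketch_pos_scores], dim=-1)
sketch_padded = sketch_padded.view(
    *sketch_pos_scores.size()[:2], sketch_pos_scores.size(3) + 1, sketch_pos_scores.size(2)
)
sketch_shifted = sketch_padded[:, :, 1:].view_as(sketch_pos_scores)[
    :, :, :, : sketch_pos_scores.size(-1) // 2 + 1
]
print(sketch_shifted.shape)  # torch.Size([1, 1, 3, 3]) -- (batch, heads, time1, time1)
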
class FastSpeech2ConformerConvolutionModule(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
# kernel_size should be an odd number for 'SAME' padding
channels = config.hidden_size
kernel_size = module_config["kernel_size"]
self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
self.depthwise_conv = nn.Conv1d(
channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
)
self.norm = nn.BatchNorm1d(channels)
self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
def forward(self, hidden_states):
"""
Compute convolution module.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
"""
# exchange the temporal dimension and the feature dimension
hidden_states = hidden_states.transpose(1, 2)
# GLU mechanism, (batch_size, 2*channel, dim)
hidden_states = self.pointwise_conv1(hidden_states)
# (batch_size, channel, dim)
hidden_states = nn.functional.glu(hidden_states, dim=1)
# 1D Depthwise Conv
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.norm(hidden_states)
hidden_states = hidden_states * torch.sigmoid(hidden_states)
hidden_states = self.pointwise_conv2(hidden_states)
return hidden_states.transpose(1, 2)
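
# Editor's sketch of the channel bookkeeping in the convolution module above (toy sizes): the
# first pointwise conv doubles the channels, GLU halves them again, and the depthwise conv uses
# groups=channels with 'SAME' padding so each channel is filtered independently at every step.
import torch
import torch.nn as nn

sketch_channels, sketch_kernel, sketch_frames = 8, 7, 20
sketch_x = torch.randn(1, sketch_channels, sketch_frames)  # (batch, channels, time)
sketch_x = nn.Conv1d(sketch_channels, 2 * sketch_channels, kernel_size=1)(sketch_x)
sketch_x = nn.functional.glu(sketch_x, dim=1)  # back to `channels`
sketch_x = nn.Conv1d(
    sketch_channels, sketch_channels, sketch_kernel, padding=(sketch_kernel - 1) // 2, groups=sketch_channels
)(sketch_x)
print(sketch_x.shape)  # torch.Size([1, 8, 20]) -- the time axis is preserved
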
class FastSpeech2ConformerEncoderLayer(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
# self-attention module definition
self.self_attn = FastSpeech2ConformerAttention(config, module_config)
# feed-forward module definition
self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.macaron_style = config.use_macaron_style_in_conformer
if self.macaron_style:
self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
# convolution module definition
self.use_cnn_module = config.use_cnn_in_conformer
if self.use_cnn_module:
self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
self.final_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
self.dropout = nn.Dropout(module_config["dropout_rate"])
self.size = config.hidden_size
self.normalize_before = module_config["normalize_before"]
self.concat_after = module_config["concat_after"]
if self.concat_after:
self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)
def forward(
self,
hidden_states: torch.Tensor,
pos_emb: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
):
"""
Compute encoded features.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor.
            pos_emb (`torch.Tensor` of shape `(1, 2*time-1, size)`): Positional embeddings tensor.
attention_mask (`torch.Tensor` of shape `(batch, time)`): Attention mask tensor for the input.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time, size)`.
"""
# whether to use macaron style
if self.macaron_style:
residual = hidden_states
if self.normalize_before:
hidden_states = self.ff_macaron_layer_norm(hidden_states)
hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states))
if not self.normalize_before:
hidden_states = self.ff_macaron_layer_norm(hidden_states)
# multi-headed self-attention module
residual = hidden_states
if self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
attention_output, attention_scores = self.self_attn(
hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions
)
if self.concat_after:
x_concat = torch.cat((hidden_states, attention_output), dim=-1)
hidden_states = self.concat_linear(x_concat)
hidden_states = residual + hidden_states
else:
hidden_states = self.dropout(attention_output)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# convolution module
if self.use_cnn_module:
residual = hidden_states
if self.normalize_before:
hidden_states = self.conv_layer_norm(hidden_states)
hidden_states = self.conv_module(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.conv_layer_norm(hidden_states)
# feed forward module
residual = hidden_states
if self.normalize_before:
hidden_states = self.ff_layer_norm(hidden_states)
hidden_states = self.feed_forward(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = residual + self.ff_scale * hidden_states
if not self.normalize_before:
hidden_states = self.ff_layer_norm(hidden_states)
        if self.use_cnn_module:
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attention_scores,)
return outputs
class FastSpeech2ConformerMultiLayeredConv1d(nn.Module):
"""
Multi-layered conv1d for Transformer block.
This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer
block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech'
https://arxiv.org/pdf/1905.09263.pdf
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
Initialize FastSpeech2ConformerMultiLayeredConv1d module.
Args:
input_channels (`int`): Number of input channels.
hidden_channels (`int`): Number of hidden channels.
kernel_size (`int`): Kernel size of conv1d.
dropout_rate (`float`): Dropout rate.
"""
super().__init__()
input_channels = config.hidden_size
hidden_channels = module_config["linear_units"]
kernel_size = config.positionwise_conv_kernel_size
self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.dropout = nn.Dropout(module_config["dropout_rate"])
def forward(self, hidden_states):
"""
Calculate forward propagation.
Args:
hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels).
Returns:
            torch.Tensor: Batch of output tensors (batch_size, time, input_channels).
"""
hidden_states = hidden_states.transpose(-1, 1)
hidden_states = self.conv1(hidden_states)
hidden_states = torch.relu(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = hidden_states.transpose(-1, 1)
return hidden_states
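
# Editor's sketch (illustrative sizes): the position-wise feed-forward above replaces the usual
# Linear layers with Conv1d over the time axis, so with kernel_size > 1 each position also mixes
# information from its neighbours while the overall (batch, time, hidden) shape is preserved.
import torch
import torch.nn as nn

sketch_hidden, sketch_units, sketch_kernel = 384, 1536, 3
sketch_conv1 = nn.Conv1d(sketch_hidden, sketch_units, sketch_kernel, padding=(sketch_kernel - 1) // 2)
sketch_conv2 = nn.Conv1d(sketch_units, sketch_hidden, sketch_kernel, padding=(sketch_kernel - 1) // 2)
sketch_input = torch.randn(2, 10, sketch_hidden)  # (batch, time, hidden)
sketch_output = sketch_conv2(torch.relu(sketch_conv1(sketch_input.transpose(-1, 1)))).transpose(-1, 1)
print(sketch_output.shape)  # torch.Size([2, 10, 384])
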
class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
"""
Args:
Relative positional encoding module (new implementation). Details can be found in
https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://arxiv.org/abs/1901.02860
config (`FastSpeech2ConformerConfig`):
FastSpeech2ConformerConfig instance.
module_config (`dict`):
Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
        Construct a PositionalEncoding object.
"""
super().__init__()
self.embed_dim = config.hidden_size
self.input_scale = math.sqrt(self.embed_dim)
self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
self.pos_enc = None
self.max_len = 5000
self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
def extend_pos_enc(self, x):
"""Reset the positional encodings."""
if self.pos_enc is not None:
# self.pos_enc contains both positive and negative parts
# the length of self.pos_enc is 2 * input_len - 1
if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
return
        # Suppose `i` is the position of the query vector and `j` is the position of the key
        # vector. We use positive relative positions when the keys are to the left (i > j) and
        # negative relative positions otherwise (i < j).
pos_enc_positive = torch.zeros(x.size(1), self.embed_dim)
pos_enc_negative = torch.zeros(x.size(1), self.embed_dim)
position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.embed_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embed_dim)
)
pos_enc_positive[:, 0::2] = torch.sin(position * div_term)
pos_enc_positive[:, 1::2] = torch.cos(position * div_term)
pos_enc_negative[:, 0::2] = torch.sin(-1 * position * div_term)
pos_enc_negative[:, 1::2] = torch.cos(-1 * position * div_term)
        # Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
def forward(self, feature_representation):
"""
Args:
feature_representation (`torch.Tensor` of shape (batch_size, time, `*`)):
Input tensor.
Returns:
`torch.Tensor`: Encoded tensor (batch_size, time, `*`).
"""
self.extend_pos_enc(feature_representation)
hidden_states = feature_representation * self.input_scale
center_idx = self.pos_enc.size(1) // 2
pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
return self.dropout(hidden_states), self.dropout(pos_emb)
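
# Editor's sketch of the slice arithmetic in the positional-encoding forward above (assumed
# max_len and frame counts): the table holds 2*max_len - 1 rows covering both positive and
# negative relative offsets, and the slice around the centre returns 2*time - 1 embeddings.
sketch_max_len, sketch_num_frames = 5000, 10
sketch_table_len = 2 * sketch_max_len - 1
sketch_center = sketch_table_len // 2
sketch_start, sketch_stop = sketch_center - sketch_num_frames + 1, sketch_center + sketch_num_frames
print(sketch_stop - sketch_start)  # 19 == 2 * time - 1 relative positions for 10 frames
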
class FastSpeech2ConformerEncoder(nn.Module):
"""
FastSpeech2ConformerEncoder encoder module.
Args:
config (`FastSpeech2ConformerConfig`):
FastSpeech2ConformerConfig instance.
module_config (`dict`):
Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
use_encoder_input_layer (`bool`, *optional*, defaults to `False`):
            Whether to use an embedding layer that maps input token ids to hidden states before the conformer layers.
"""
def __init__(
self,
config: FastSpeech2ConformerConfig,
module_config,
use_encoder_input_layer=False,
):
super().__init__()
self.embed = None
if use_encoder_input_layer:
self.embed = nn.Embedding(
num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=0
)
self.pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, module_config)
self.conformer_layers = nn.ModuleList(
[FastSpeech2ConformerEncoderLayer(config, module_config) for _ in range(module_config["layers"])]
)
def forward(
self,
input_tensor: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = False,
return_dict: Optional[bool] = None,
):
"""
Args:
            input_tensor (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary when `use_encoder_input_layer=True`, otherwise
                already-embedded hidden states. Padding will be ignored by default should you provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
`torch.Tensor`:
Output tensor of shape `(batch, time, attention_dim)`.
"""
feature_representation = input_tensor
if self.embed is not None:
feature_representation = self.embed(feature_representation)
hidden_states, pos_emb = self.pos_enc(feature_representation)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for conformer_layer in self.conformer_layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = conformer_layer(hidden_states, pos_emb, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# Add last layer
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
class FastSpeech2ConformerLoss(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
use_masking = config.use_masking
use_weighted_masking = config.use_weighted_masking
if use_masking and use_weighted_masking:
raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = nn.MSELoss(reduction=reduction)
self.log_domain_offset = 1.0
def forward(
self,
outputs_after_postnet,
outputs_before_postnet,
duration_outputs,
pitch_outputs,
energy_outputs,
spectrogram_labels,
duration_labels,
pitch_labels,
energy_labels,
duration_mask,
spectrogram_mask,
):
"""
Args:
outputs_after_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
Batch of outputs after postnet.
outputs_before_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
Batch of outputs before postnet.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length)`):
Batch of outputs of duration predictor.
pitch_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of outputs of pitch predictor.
energy_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of outputs of energy predictor.
spectrogram_labels (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
Batch of target features.
duration_labels (`torch.LongTensor` of shape `(batch_size, max_text_length)`): Batch of durations.
pitch_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of target token-averaged pitch.
energy_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
Batch of target token-averaged energy.
duration_mask (`torch.LongTensor`):
Mask used to discern which values the duration loss should be calculated for.
spectrogram_mask (`torch.LongTensor`):
                Mask used to discern which values the spectrogram loss should be calculated for.
Returns:
`tuple(torch.FloatTensor)`: Tuple of tensors containing, in order, the L1 loss value, duration predictor
loss value, pitch predictor loss value, and energy predictor loss value.
"""
pitch_and_energy_masks = duration_mask.unsqueeze(-1)
# apply mask to remove padded part
if self.use_masking:
outputs_before_postnet = outputs_before_postnet.masked_select(spectrogram_mask)
if outputs_after_postnet is not None:
outputs_after_postnet = outputs_after_postnet.masked_select(spectrogram_mask)
spectrogram_labels = spectrogram_labels.masked_select(spectrogram_mask)
duration_outputs = duration_outputs.masked_select(duration_mask)
duration_labels = duration_labels.masked_select(duration_mask)
pitch_outputs = pitch_outputs.masked_select(pitch_and_energy_masks)
energy_outputs = energy_outputs.masked_select(pitch_and_energy_masks)
pitch_labels = pitch_labels.masked_select(pitch_and_energy_masks)
energy_labels = energy_labels.masked_select(pitch_and_energy_masks)
# calculate loss
l1_loss = self.l1_criterion(outputs_before_postnet, spectrogram_labels)
if outputs_after_postnet is not None:
l1_loss = l1_loss + self.l1_criterion(outputs_after_postnet, spectrogram_labels)
duration_labels = torch.log(duration_labels.float() + self.log_domain_offset)
duration_loss = self.duration_criterion(duration_outputs, duration_labels)
pitch_loss = self.mse_criterion(pitch_outputs, pitch_labels)
energy_loss = self.mse_criterion(energy_outputs, energy_labels)
# make weighted mask and apply it
if self.use_weighted_masking:
spectrogram_mask = nn.functional.pad(
spectrogram_mask.transpose(1, 2),
[0, spectrogram_labels.size(1) - spectrogram_mask.size(1), 0, 0, 0, 0],
value=False,
).transpose(1, 2)
out_weights = spectrogram_mask.float() / spectrogram_mask.sum(dim=1, keepdim=True).float()
out_weights /= spectrogram_labels.size(0) * spectrogram_labels.size(2)
duration_weights = duration_mask.float() / duration_mask.sum(dim=1, keepdim=True).float()
duration_weights /= duration_labels.size(0)
# apply weight
l1_loss = l1_loss.mul(out_weights).masked_select(spectrogram_mask).sum()
duration_loss = duration_loss.mul(duration_weights).masked_select(duration_mask).sum()
pitch_weights = duration_weights.unsqueeze(-1)
pitch_loss = pitch_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
energy_loss = energy_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
return l1_loss + duration_loss + pitch_loss + energy_loss
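
# Editor's sketch (toy values): the duration loss above is computed in the log domain with a
# +1 offset, so a zero-length duration maps to log(1) = 0 rather than -inf.
import torch

sketch_durations = torch.tensor([0, 2, 5], dtype=torch.float32)
print(torch.log(sketch_durations + 1.0))  # tensor([0.0000, 1.0986, 1.7918])
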
class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FastSpeech2ConformerConfig
base_model_prefix = "fastspeech2_conformer"
main_input_name = "input_ids"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.LayerNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
key = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-key, b=key)
elif isinstance(module, nn.Embedding):
module.weight.data.normal_()
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, FastSpeech2ConformerAttention):
nn.init.xavier_uniform_(module.pos_bias_u)
nn.init.xavier_uniform_(module.pos_bias_v)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, FastSpeech2ConformerEncoder):
module.gradient_checkpointing = value
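
# Editor's sketch of the Conv1d initialisation in `_init_weights` above (illustrative sizes):
# the weight gets a Kaiming-normal init and the bias is drawn from U(-key, key) with
# key = sqrt(groups / (in_channels * kernel_size)).
import math
import torch.nn as nn

sketch_conv = nn.Conv1d(in_channels=384, out_channels=384, kernel_size=3, groups=1, bias=True)
nn.init.kaiming_normal_(sketch_conv.weight)
sketch_key = math.sqrt(sketch_conv.groups / (sketch_conv.in_channels * sketch_conv.kernel_size[0]))
nn.init.uniform_(sketch_conv.bias, a=-sketch_key, b=sketch_key)
print(round(sketch_key, 4))  # ~0.0295 for these illustrative sizes
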
@add_start_docstrings(
"""FastSpeech2Conformer Model.""",
FASTSPEECH2_CONFORMER_START_DOCSTRING,
)
class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
"""
FastSpeech 2 module.
This is a module of FastSpeech 2 described in 'FastSpeech 2: Fast and High-Quality End-to-End Text to Speech'
    https://arxiv.org/abs/2006.04558. Instead of quantized pitch and energy, we use token-averaged values introduced in
FastPitch: Parallel Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers instead of regular
Transformers.
"""
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__(config)
self.config = config
# store hyperparameters
self.vocab_size = config.vocab_size
self.num_mel_bins = config.num_mel_bins
self.hidden_size = config.hidden_size
self.reduction_factor = config.reduction_factor
self.stop_gradient_from_pitch_predictor = config.stop_gradient_from_pitch_predictor
self.stop_gradient_from_energy_predictor = config.stop_gradient_from_energy_predictor
self.multilingual_model = config.num_languages is not None and config.num_languages > 1
if self.multilingual_model:
self.language_id_embedding = torch.nn.Embedding(config.num_languages, self.hidden_size)
self.multispeaker_model = config.num_speakers is not None and config.num_speakers > 1
if self.multispeaker_model:
self.speaker_id_embedding = torch.nn.Embedding(config.num_speakers, config.hidden_size)
self.speaker_embed_dim = config.speaker_embed_dim
if self.speaker_embed_dim:
self.projection = nn.Linear(config.hidden_size + self.speaker_embed_dim, config.hidden_size)
self.encoder = FastSpeech2ConformerEncoder(config, config.encoder_config, use_encoder_input_layer=True)
self.duration_predictor = FastSpeech2ConformerDurationPredictor(config)
self.pitch_predictor = FastSpeech2ConformerVariancePredictor(
config,
num_layers=config.pitch_predictor_layers,
num_chans=config.pitch_predictor_channels,
kernel_size=config.pitch_predictor_kernel_size,
dropout_rate=config.pitch_predictor_dropout,
)
# continuous pitch + FastPitch style avg
self.pitch_embed = FastSpeech2ConformerVarianceEmbedding(
out_channels=self.hidden_size,
kernel_size=config.pitch_embed_kernel_size,
padding=(config.pitch_embed_kernel_size - 1) // 2,
dropout_rate=config.pitch_embed_dropout,
)
self.energy_predictor = FastSpeech2ConformerVariancePredictor(
config,
num_layers=config.energy_predictor_layers,
num_chans=config.energy_predictor_channels,
kernel_size=config.energy_predictor_kernel_size,
dropout_rate=config.energy_predictor_dropout,
)
# continuous energy + FastPitch style avg
self.energy_embed = FastSpeech2ConformerVarianceEmbedding(
out_channels=self.hidden_size,
kernel_size=config.energy_embed_kernel_size,
padding=(config.energy_embed_kernel_size - 1) // 2,
dropout_rate=config.energy_embed_dropout,
)
# The decoder is an encoder
self.decoder = FastSpeech2ConformerEncoder(config, config.decoder_config, use_encoder_input_layer=False)
self.speech_decoder_postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)
self.criterion = FastSpeech2ConformerLoss(config)
self.post_init()
@replace_return_docstrings(output_type=FastSpeech2ConformerModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Union[Tuple, FastSpeech2ConformerModelOutput]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`FastSpeech2ConformerTokenizer`].
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to `None`):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
`[0, 1]`: 0 for tokens that are **masked**, 1 for tokens that are **not masked**.
spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
Batch of padded target features.
duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
Batch of padded durations.
pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged pitch.
energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged energy.
speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Speaker ids used to condition features of speech output by the model.
lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Language ids used to condition features of speech output by the model.
speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
Embedding containing conditioning signals for the features of the speech.
return_dict (`bool`, *optional*, defaults to `None`):
Whether or not to return a [`FastSpeech2ConformerModelOutput`] instead of a plain tuple.
output_attentions (`bool`, *optional*, defaults to `None`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*, defaults to `None`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
Returns:
Example:
```python
>>> from transformers import (
... FastSpeech2ConformerTokenizer,
... FastSpeech2ConformerModel,
... FastSpeech2ConformerHifiGan,
... )
>>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
>>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
>>> output_dict = model(input_ids, return_dict=True)
>>> spectrogram = output_dict["spectrogram"]
>>> vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan")
>>> waveform = vocoder(spectrogram)
>>> print(waveform.shape)
torch.Size([1, 49664])
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if attention_mask is None:
            attention_mask = torch.ones(input_ids.shape, device=input_ids.device)
has_missing_labels = (
spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
)
if self.training and has_missing_labels:
raise ValueError("All labels must be provided to run in training mode.")
# forward encoder
text_masks = attention_mask.unsqueeze(-2)
encoder_outputs = self.encoder(
input_ids,
text_masks,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
# Integrate with language id, speaker id, and speaker embedding
if self.multispeaker_model and speaker_ids is not None:
speaker_id_embeddings = self.speaker_id_embedding(speaker_ids.view(-1))
hidden_states = hidden_states + speaker_id_embeddings.unsqueeze(1)
if self.multilingual_model and lang_ids is not None:
            language_id_embeddings = self.language_id_embedding(lang_ids.view(-1))
            hidden_states = hidden_states + language_id_embeddings.unsqueeze(1)
if self.speaker_embed_dim is not None and speaker_embedding is not None:
embeddings_expanded = (
nn.functional.normalize(speaker_embedding).unsqueeze(1).expand(-1, hidden_states.size(1), -1)
)
hidden_states = self.projection(torch.cat([hidden_states, embeddings_expanded], dim=-1))
# forward duration predictor and variance predictors
duration_mask = ~attention_mask.bool()
if self.stop_gradient_from_pitch_predictor:
pitch_predictions = self.pitch_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
else:
pitch_predictions = self.pitch_predictor(hidden_states, duration_mask.unsqueeze(-1))
if self.stop_gradient_from_energy_predictor:
energy_predictions = self.energy_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
else:
energy_predictions = self.energy_predictor(hidden_states, duration_mask.unsqueeze(-1))
duration_predictions = self.duration_predictor(hidden_states)
duration_predictions = duration_predictions.masked_fill(duration_mask, 0.0)
if not self.training:
# use prediction in inference
embedded_pitch_curve = self.pitch_embed(pitch_predictions)
embedded_energy_curve = self.energy_embed(energy_predictions)
hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
hidden_states = length_regulator(hidden_states, duration_predictions, self.config.speaking_speed)
else:
# use groundtruth in training
embedded_pitch_curve = self.pitch_embed(pitch_labels)
embedded_energy_curve = self.energy_embed(energy_labels)
hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
hidden_states = length_regulator(hidden_states, duration_labels)
# forward decoder
if not self.training:
hidden_mask = None
else:
spectrogram_mask = (spectrogram_labels != -100).any(dim=-1)
spectrogram_mask = spectrogram_mask.int()
if self.reduction_factor > 1:
length_dim = spectrogram_mask.shape[1] - spectrogram_mask.shape[1] % self.reduction_factor
                spectrogram_mask = spectrogram_mask[:, :length_dim]
hidden_mask = spectrogram_mask.unsqueeze(-2)
decoder_outputs = self.decoder(
hidden_states,
hidden_mask,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
outputs_before_postnet, outputs_after_postnet = self.speech_decoder_postnet(decoder_outputs[0])
loss = None
if self.training:
# calculate loss
loss_duration_mask = ~duration_mask
loss_spectrogram_mask = spectrogram_mask.unsqueeze(-1).bool()
loss = self.criterion(
outputs_after_postnet=outputs_after_postnet,
outputs_before_postnet=outputs_before_postnet,
duration_outputs=duration_predictions,
pitch_outputs=pitch_predictions,
energy_outputs=energy_predictions,
spectrogram_labels=spectrogram_labels,
duration_labels=duration_labels,
pitch_labels=pitch_labels,
energy_labels=energy_labels,
duration_mask=loss_duration_mask,
spectrogram_mask=loss_spectrogram_mask,
)
if not return_dict:
postnet_outputs = (outputs_after_postnet,)
audio_feature_predictions = (
duration_predictions,
pitch_predictions,
energy_predictions,
)
outputs = postnet_outputs + encoder_outputs + decoder_outputs[1:] + audio_feature_predictions
return ((loss,) + outputs) if loss is not None else outputs
return FastSpeech2ConformerModelOutput(
loss=loss,
spectrogram=outputs_after_postnet,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
duration_outputs=duration_predictions,
pitch_outputs=pitch_predictions,
energy_outputs=energy_predictions,
)
# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
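
# Editor's sketch (toy sizes): the `get_padding` formula above, (kernel_size * dilation -
# dilation) // 2, gives 'SAME' padding for odd kernels, so every dilated conv in the residual
# block keeps the time axis unchanged and the residual addition lines up.
import torch
import torch.nn as nn

sketch_kernel, sketch_dilation, sketch_chan, sketch_len = 3, 5, 4, 50
sketch_pad = (sketch_kernel * sketch_dilation - sketch_dilation) // 2
sketch_conv = nn.Conv1d(sketch_chan, sketch_chan, sketch_kernel, dilation=sketch_dilation, padding=sketch_pad)
print(sketch_conv(torch.randn(1, sketch_chan, sketch_len)).shape)  # torch.Size([1, 4, 50])
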
@add_start_docstrings(
"""HiFi-GAN vocoder.""",
HIFIGAN_START_DOCSTRING,
)
# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer
class FastSpeech2ConformerHifiGan(PreTrainedModel):
config_class = FastSpeech2ConformerHifiGanConfig
main_input_name = "spectrogram"
def __init__(self, config: FastSpeech2ConformerHifiGanConfig):
super().__init__(config)
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
config.model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
self.register_buffer("mean", torch.zeros(config.model_in_dim))
self.register_buffer("scale", torch.ones(config.model_in_dim))
# Initialize weights and apply final processing
self.post_init()
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.weight_norm(layer)
for layer in self.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.conv_post)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.conv_post)
def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
if self.config.normalize_before:
spectrogram = (spectrogram - self.mean) / self.scale
is_batched = spectrogram.dim() == 3
if not is_batched:
spectrogram = spectrogram.unsqueeze(0)
hidden_states = spectrogram.transpose(2, 1)
hidden_states = self.conv_pre(hidden_states)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
hidden_states = torch.tanh(hidden_states)
if not is_batched:
# remove batch dim and collapse tensor to 1-d audio waveform
waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
else:
# remove seq-len dim since this collapses to 1
waveform = hidden_states.squeeze(1)
return waveform
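
# Editor's sketch (assumed upsample rates, not read from any config): the waveform length is
# the number of spectrogram frames times the product of the transposed-conv upsample rates; a
# total factor of 256 would turn 194 frames into the 49,664 samples seen in the example above.
import math

sketch_upsample_rates = [8, 8, 2, 2]  # hypothetical values for illustration
sketch_frames = 194
print(math.prod(sketch_upsample_rates), sketch_frames * math.prod(sketch_upsample_rates))  # 256 49664
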
@add_start_docstrings(
"The FastSpeech2ConformerModel with a FastSpeech2ConformerHifiGan vocoder head that performs text-to-speech (waveform).",
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING,
)
class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
config_class = FastSpeech2ConformerWithHifiGanConfig
def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig):
super().__init__(config)
self.model = FastSpeech2ConformerModel(config.model_config)
self.vocoder = FastSpeech2ConformerHifiGan(config.vocoder_config)
self.config = config
@replace_return_docstrings(
output_type=FastSpeech2ConformerWithHifiGanOutput, config_class=FastSpeech2ConformerWithHifiGanConfig
)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Union[Tuple, FastSpeech2ConformerModelOutput]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`FastSpeech2ConformerTokenizer`].
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to `None`):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
`[0, 1]`: 0 for tokens that are **masked**, 1 for tokens that are **not masked**.
spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
Batch of padded target features.
duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
Batch of padded durations.
pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged pitch.
energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
Batch of padded token-averaged energy.
speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Speaker ids used to condition features of speech output by the model.
lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
Language ids used to condition features of speech output by the model.
speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
Embedding containing conditioning signals for the features of the speech.
return_dict (`bool`, *optional*, defaults to `None`):
Whether or not to return a [`FastSpeech2ConformerModelOutput`] instead of a plain tuple.
output_attentions (`bool`, *optional*, defaults to `None`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*, defaults to `None`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
Returns:
Example:
```python
>>> from transformers import (
... FastSpeech2ConformerTokenizer,
... FastSpeech2ConformerWithHifiGan,
... )
>>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
>>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
>>> output_dict = model(input_ids, return_dict=True)
>>> waveform = output_dict["waveform"]
>>> print(waveform.shape)
torch.Size([1, 49664])
```
"""
return_dict = return_dict if return_dict is not None else self.config.model_config.use_return_dict
output_attentions = (
output_attentions if output_attentions is not None else self.config.model_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.model_config.output_hidden_states
)
model_outputs = self.model(
input_ids,
attention_mask,
spectrogram_labels=spectrogram_labels,
duration_labels=duration_labels,
pitch_labels=pitch_labels,
energy_labels=energy_labels,
speaker_ids=speaker_ids,
lang_ids=lang_ids,
speaker_embedding=speaker_embedding,
return_dict=return_dict,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
if not return_dict:
has_missing_labels = (
spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
)
if has_missing_labels:
spectrogram = model_outputs[0]
else:
spectrogram = model_outputs[1]
else:
spectrogram = model_outputs["spectrogram"]
waveform = self.vocoder(spectrogram)
if not return_dict:
return model_outputs + (waveform,)
return FastSpeech2ConformerWithHifiGanOutput(waveform=waveform, **model_outputs)
# coding=utf-8
# Copyright 2023 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for FastSpeech2Conformer."""
import json
import os
from typing import Optional, Tuple
import regex
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/vocab.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    # Set to a somewhat arbitrary large number as the model input
    # isn't constrained by the relative positional encoding
"espnet/fastspeech2_conformer": 4096,
}
class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
"""
Construct a FastSpeech2Conformer tokenizer.
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
            The beginning of sequence token. Note that for FastSpeech2, it is the same as the `eos_token`.
eos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
The end of sequence token. Note that for FastSpeech2, it is the same as the `bos_token`.
pad_token (`str`, *optional*, defaults to `"<blank>"`):
The token used for padding, for example when batching sequences of different lengths.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
should_strip_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to strip the spaces from the list of tokens.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
bos_token="<sos/eos>",
eos_token="<sos/eos>",
pad_token="<blank>",
unk_token="<unk>",
should_strip_spaces=False,
**kwargs,
):
requires_backends(self, "g2p_en")
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
import g2p_en
self.g2p = g2p_en.G2p()
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
should_strip_spaces=should_strip_spaces,
**kwargs,
)
self.should_strip_spaces = should_strip_spaces
@property
def vocab_size(self):
return len(self.decoder)
def get_vocab(self):
"Returns vocab as a dict"
return dict(self.encoder, **self.added_tokens_encoder)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# expand symbols
text = regex.sub(";", ",", text)
text = regex.sub(":", ",", text)
text = regex.sub("-", " ", text)
text = regex.sub("&", "and", text)
# strip unnecessary symbols
text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
# strip whitespaces
text = regex.sub(r"\s+", " ", text)
text = text.upper()
return text, kwargs
def _tokenize(self, text):
"""Returns a tokenized string."""
# phonemize
tokens = self.g2p(text)
if self.should_strip_spaces:
tokens = list(filter(lambda s: s != " ", tokens))
tokens.append(self.eos_token)
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
# Override since phonemes cannot be converted back to strings
def decode(self, token_ids, **kwargs):
        logger.warning(
            "Phonemes cannot be reliably converted to a string due to the one-to-many mapping, converting to tokens instead."
)
return self.convert_ids_to_tokens(token_ids)
# Override since phonemes cannot be converted back to strings
def convert_tokens_to_string(self, tokens, **kwargs):
        logger.warning(
            "Phonemes cannot be reliably converted to a string due to the one-to-many mapping, returning the tokens."
)
return tokens
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.get_vocab(), ensure_ascii=False))
return (vocab_file,)
def __getstate__(self):
state = self.__dict__.copy()
state["g2p"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import g2p_en
self.g2p = g2p_en.G2p()
except ImportError:
raise ImportError(
"You need to install g2p-en to use FastSpeech2ConformerTokenizer. "
"See https://pypi.org/project/g2p-en/ for installation."
)
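
# Editor's sketch of the `prepare_for_tokenization` normalisation above on a sample string
# (the standard-library `re` module is used here only because it behaves like `regex` for
# these simple patterns):
import re

sketch_text = "Hello - world & friends; (see docs)"
sketch_text = re.sub(";", ",", sketch_text)
sketch_text = re.sub(":", ",", sketch_text)
sketch_text = re.sub("-", " ", sketch_text)
sketch_text = re.sub("&", "and", sketch_text)
sketch_text = re.sub(r"[\(\)\[\]\<\>\"]+", "", sketch_text)
sketch_text = re.sub(r"\s+", " ", sketch_text)
print(sketch_text.upper())  # HELLO WORLD AND FRIENDS, SEE DOCS
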
@@ -2655,6 +2655,7 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
         return_dict: Optional[bool] = None,
         speaker_embeddings: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.FloatTensor] = None,
+        stop_labels: Optional[torch.Tensor] = None,
     ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -2973,6 +2974,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
         return_dict: Optional[bool] = None,
         speaker_embeddings: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.FloatTensor] = None,
+        stop_labels: Optional[torch.Tensor] = None,
     ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -67,6 +67,7 @@ from .utils import (
     is_flax_available,
     is_fsdp_available,
     is_ftfy_available,
+    is_g2p_en_available,
     is_ipex_available,
     is_jieba_available,
     is_jinja_available,
@@ -365,6 +366,13 @@ def require_fsdp(test_case, min_version: str = "1.12.0"):
     )


+def require_g2p_en(test_case):
+    """
+    Decorator marking a test that requires g2p_en. These tests are skipped when g2p_en isn't installed.
+    """
+    return unittest.skipUnless(is_g2p_en_available(), "test requires g2p_en")(test_case)
+
+
 def require_safetensors(test_case):
     """
     Decorator marking a test that requires safetensors. These tests are skipped when safetensors isn't installed.
@@ -123,6 +123,7 @@ from .import_utils import (
     is_flax_available,
     is_fsdp_available,
     is_ftfy_available,
+    is_g2p_en_available,
     is_in_notebook,
     is_ipex_available,
     is_jieba_available,
@@ -3422,6 +3422,37 @@ class FalconPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])


+FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class FastSpeech2ConformerHifiGan(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class FastSpeech2ConformerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class FastSpeech2ConformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class FastSpeech2ConformerWithHifiGan(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
...@@ -94,6 +94,7 @@ except importlib.metadata.PackageNotFoundError: ...@@ -94,6 +94,7 @@ except importlib.metadata.PackageNotFoundError:
except importlib.metadata.PackageNotFoundError: except importlib.metadata.PackageNotFoundError:
_faiss_available = False _faiss_available = False
_ftfy_available = _is_package_available("ftfy") _ftfy_available = _is_package_available("ftfy")
_g2p_en_available = _is_package_available("g2p_en")
_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
_jieba_available = _is_package_available("jieba")
_jinja_available = _is_package_available("jinja2")
@@ -444,6 +445,10 @@ def is_ftfy_available():
return _ftfy_available
def is_g2p_en_available():
return _g2p_en_available
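# Illustration only: downstream code can use the new helper to guard an optional g2p_en
# import instead of wrapping the import in try/except. The variable names are invented.
from transformers.utils import is_g2p_en_available
if is_g2p_en_available():
    from g2p_en import G2p
    phonemes = G2p()("thirty two")  # ARPAbet-style phoneme list
else:
    phonemes = None  # degrade gracefully when the backend is missing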
@lru_cache()
def is_torch_tpu_available(check_device=True):
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
@@ -1059,6 +1064,12 @@ LEVENSHTEIN_IMPORT_ERROR = """
install python-Levenshtein`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
G2P_EN_IMPORT_ERROR = """
{0} requires the g2p-en library but it was not found in your environment. You can install it with pip:
`pip install g2p-en`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
PYTORCH_QUANTIZATION_IMPORT_ERROR = """
{0} requires the pytorch-quantization library but it was not found in your environment. You can install it with pip:
@@ -1101,7 +1112,6 @@ SACREMOSES_IMPORT_ERROR = """
`pip install sacremoses`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
SCIPY_IMPORT_ERROR = """
{0} requires the scipy library but it was not found in your environment. You can install it with pip:
@@ -1225,6 +1235,7 @@ BACKENDS_MAPPING = OrderedDict(
("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)),
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)), ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)), ("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)),
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch FastSpeech2Conformer model."""
import inspect
import tempfile
import unittest
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerTokenizer,
FastSpeech2ConformerWithHifiGanConfig,
is_torch_available,
)
from transformers.testing_utils import require_g2p_en, require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
if is_torch_available():
import torch
from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerWithHifiGan, set_seed
class FastSpeech2ConformerModelTester:
def __init__(
self,
parent,
batch_size=13,
num_hidden_layers=1,
num_attention_heads=2,
hidden_size=24,
seq_length=7,
encoder_linear_units=384,
decoder_linear_units=384,
is_training=False,
speech_decoder_postnet_units=128,
speech_decoder_postnet_layers=2,
pitch_predictor_layers=1,
energy_predictor_layers=1,
duration_predictor_layers=1,
num_mel_bins=8,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.vocab_size = hidden_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.encoder_linear_units = encoder_linear_units
self.decoder_linear_units = decoder_linear_units
self.speech_decoder_postnet_units = speech_decoder_postnet_units
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
self.pitch_predictor_layers = pitch_predictor_layers
self.energy_predictor_layers = energy_predictor_layers
self.duration_predictor_layers = duration_predictor_layers
self.num_mel_bins = num_mel_bins
def prepare_config_and_inputs(self):
config = self.get_config()
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
return config, input_ids
def get_config(self):
return FastSpeech2ConformerConfig(
hidden_size=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_linear_units=self.encoder_linear_units,
decoder_linear_units=self.decoder_linear_units,
speech_decoder_postnet_units=self.speech_decoder_postnet_units,
speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
num_mel_bins=self.num_mel_bins,
pitch_predictor_layers=self.pitch_predictor_layers,
energy_predictor_layers=self.energy_predictor_layers,
duration_predictor_layers=self.duration_predictor_layers,
)
def create_and_check_model(self, config, input_ids, *args):
model = FastSpeech2ConformerModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, return_dict=True)
# total of 5 keys in result
self.parent.assertEqual(len(result), 5)
# check batch sizes match
for value in result.values():
self.parent.assertEqual(value.size(0), self.batch_size)
# check duration, pitch, and energy have the appropriate shapes
# duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1)
self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape)
self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape)
# check predicted mel-spectrogram has correct dimension
self.parent.assertEqual(result["spectrogram"].size(2), model.config.num_mel_bins)
def prepare_config_and_inputs_for_common(self):
config, input_ids = self.prepare_config_and_inputs()
inputs_dict = {"input_ids": input_ids}
return config, inputs_dict
@require_torch
class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (FastSpeech2ConformerModel,) if is_torch_available() else ()
test_pruning = False
test_headmasking = False
test_torchscript = False
test_resize_embeddings = False
is_encoder_decoder = True
def setUp(self):
self.model_tester = FastSpeech2ConformerModelTester(self)
self.config_tester = ConfigTester(self, config_class=FastSpeech2ConformerConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
msg = f"Parameter {name} of model {model_class} seems not properly initialized"
if "norm" in name:
if "bias" in name:
self.assertEqual(param.data.mean().item(), 0.0, msg=msg)
if "weight" in name:
self.assertEqual(param.data.mean().item(), 1.0, msg=msg)
elif "conv" in name or "embed" in name:
self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg)
def test_duration_energy_pitch_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
# duration
self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len])
# energy
self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
# pitch
self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
def test_hidden_states_output(self):
def _check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]):
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
self.assertIsInstance(hidden_states, (list, tuple))
expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape
self.assertEqual(expected_batch_size, self.model_tester.batch_size)
# Only test encoder seq_length since decoder seq_length is variable based on inputs
if idx == 0:
self.assertEqual(expected_seq_length, self.model_tester.seq_length)
self.assertEqual(expected_hidden_size, self.model_tester.hidden_size)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict["output_hidden_states"] = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel)
def test_save_load_strict(self):
config, _ = self.model_tester.prepare_config_and_inputs()
model = FastSpeech2ConformerModel(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
_, info = FastSpeech2ConformerModel.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], [])
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = FastSpeech2ConformerModel(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = [
"input_ids",
"attention_mask",
"spectrogram_labels",
"duration_labels",
"pitch_labels",
"energy_labels",
"speaker_ids",
"lang_ids",
"speaker_embedding",
"return_dict",
"output_attentions",
"output_hidden_states",
]
self.assertListEqual(arg_names, expected_arg_names)
# Override as FastSpeech2Conformer does not output cross attentions
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True
config.output_attentions = True
model = FastSpeech2ConformerModel(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerModel)
outputs = model(**inputs)
output = outputs[0]
encoder_hidden_states = outputs.encoder_hidden_states[0]
encoder_hidden_states.retain_grad()
decoder_hidden_states = outputs.decoder_hidden_states[0]
decoder_hidden_states.retain_grad()
encoder_attentions = outputs.encoder_attentions[0]
encoder_attentions.retain_grad()
decoder_attentions = outputs.decoder_attentions[0]
decoder_attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(decoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
self.assertIsNotNone(decoder_attentions.grad)
def test_attention_outputs(self):
"""
Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable
decoder attention shape, and uniquely outputs energy, pitch, and durations.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
encoder_attentions = outputs.encoder_attentions
self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(encoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
out_len = len(outputs)
correct_outlen = 7
self.assertEqual(out_len, correct_outlen)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
added_hidden_states = 2
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.encoder_attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
@slow
def test_model_from_pretrained(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
self.assertIsNotNone(model)
@unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="FastSpeech2Conformer has no input embeddings")
def test_model_common_attributes(self):
pass
@require_torch
@require_g2p_en
@slow
class FastSpeech2ConformerModelIntegrationTest(unittest.TestCase):
def test_inference_integration(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
model.to(torch_device)
model.eval()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
text = "Test that this generates speech"
input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"]
outputs_dict = model(input_ids)
spectrogram = outputs_dict["spectrogram"]
# mel-spectrogram is too large (1, 205, 80), so only check top-left 100 elements
# fmt: off
expected_mel_spectrogram = torch.tensor(
[
[-1.2426, -1.7286, -1.6754, -1.7451, -1.6402, -1.5219, -1.4480, -1.3345, -1.4031, -1.4497],
[-0.7858, -1.4966, -1.3602, -1.4876, -1.2949, -1.0723, -1.0021, -0.7553, -0.6521, -0.6929],
[-0.7298, -1.3908, -1.0369, -1.2656, -1.0342, -0.7883, -0.7420, -0.5249, -0.3734, -0.3977],
[-0.4784, -1.3508, -1.1558, -1.4678, -1.2820, -1.0252, -1.0868, -0.9006, -0.8947, -0.8448],
[-0.3963, -1.2895, -1.2813, -1.6147, -1.4658, -1.2560, -1.4134, -1.2650, -1.3255, -1.1715],
[-1.4914, -1.3097, -0.3821, -0.3898, -0.5748, -0.9040, -1.0755, -1.0575, -1.2205, -1.0572],
[0.0197, -0.0582, 0.9147, 1.1512, 1.1651, 0.6628, -0.1010, -0.3085, -0.2285, 0.2650],
[1.1780, 0.1803, 0.7251, 1.5728, 1.6678, 0.4542, -0.1572, -0.1787, 0.0744, 0.8168],
[-0.2078, -0.3211, 1.1096, 1.5085, 1.4632, 0.6299, -0.0515, 0.0589, 0.8609, 1.4429],
[0.7831, -0.2663, 1.0352, 1.4489, 0.9088, 0.0247, -0.3995, 0.0078, 1.2446, 1.6998],
],
device=torch_device,
)
# fmt: on
self.assertTrue(torch.allclose(spectrogram[0, :10, :10], expected_mel_spectrogram, atol=1e-4))
self.assertEqual(spectrogram.shape, (1, 205, model.config.num_mel_bins))
def test_training_integration(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
model.to(torch_device)
# Set self.training manually (rather than calling model.train()) to keep the run deterministic while still exercising the training path
model.training = True
set_seed(0)
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
text = "Test that this generates speech"
input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"]
# NOTE: Dummy numbers since FastSpeech2Conformer does not have a feature extractor due to the package deps required (librosa, MFA)
batch_size, max_text_len = input_ids.shape
pitch_labels = torch.rand((batch_size, max_text_len, 1), dtype=torch.float, device=torch_device)
energy_labels = torch.rand((batch_size, max_text_len, 1), dtype=torch.float, device=torch_device)
duration_labels = torch.normal(10, 2, size=(batch_size, max_text_len)).clamp(1, 20).int()
max_target_len, _ = duration_labels.sum(dim=1).max(dim=0)
max_target_len = max_target_len.item()
spectrogram_labels = torch.rand(
(batch_size, max_target_len, model.num_mel_bins), dtype=torch.float, device=torch_device
)
outputs_dict = model(
input_ids,
spectrogram_labels=spectrogram_labels,
duration_labels=duration_labels,
pitch_labels=pitch_labels,
energy_labels=energy_labels,
return_dict=True,
)
spectrogram = outputs_dict["spectrogram"]
loss = outputs_dict["loss"]
# mel-spectrogram is too large (1, 224, 80), so only check top-left 100 elements
# fmt: off
expected_mel_spectrogram = torch.tensor(
[
[-1.0643e+00, -6.8058e-01, -1.0901e+00, -8.2724e-01, -7.7241e-01, -1.1905e+00, -8.5725e-01, -8.2930e-01, -1.1313e+00, -1.2449e+00],
[-5.5067e-01, -2.7045e-01, -6.3483e-01, -1.9320e-01, 1.0234e-01, -3.3253e-01, -2.4423e-01, -3.5045e-01, -5.2070e-01, -4.3710e-01],
[ 2.2181e-01, 3.1433e-01, -1.2849e-01, 6.0253e-01, 1.0033e+00, 1.3952e-01, 1.2851e-01, -2.3063e-02, -1.5092e-01, 2.4903e-01],
[ 4.6343e-01, 4.1820e-01, 1.6468e-01, 1.1297e+00, 1.4588e+00, 1.3737e-01, 6.6355e-02, -6.0973e-02, -5.4225e-02, 5.9208e-01],
[ 5.2762e-01, 4.8725e-01, 4.2735e-01, 1.4392e+00, 1.7398e+00, 2.4891e-01, -8.4531e-03, -8.1282e-02, 1.2857e-01, 8.7559e-01],
[ 5.2548e-01, 5.1653e-01, 5.2034e-01, 1.3782e+00, 1.5972e+00, 1.6380e-01, -5.1807e-02, 1.5474e-03, 2.2824e-01, 8.5288e-01],
[ 3.6356e-01, 4.4109e-01, 4.4257e-01, 9.4273e-01, 1.1201e+00, -9.0551e-03, -1.1627e-01, -2.0821e-02, 1.0793e-01, 5.0336e-01],
[ 3.6598e-01, 3.2708e-01, 1.3297e-01, 4.5162e-01, 6.4168e-01, -2.6923e-01, -2.3101e-01, -1.4943e-01, -1.4732e-01, 7.3057e-02],
[ 2.7639e-01, 2.2588e-01, -1.5310e-01, 1.0957e-01, 3.3048e-01, -5.3431e-01, -3.3822e-01, -2.8007e-01, -3.3823e-01, -1.5775e-01],
[ 2.9323e-01, 1.6723e-01, -3.4153e-01, -1.1209e-01, 1.7355e-01, -6.1724e-01, -5.4201e-01, -4.9944e-01, -5.2212e-01, -2.7596e-01]
],
device=torch_device,
)
# fmt: on
expected_loss = torch.tensor(74.4595, device=torch_device)
self.assertTrue(torch.allclose(spectrogram[0, :10, :10], expected_mel_spectrogram, atol=1e-3))
self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
self.assertEqual(spectrogram.shape, (1, 224, model.config.num_mel_bins))
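# Illustration only (not part of the test suite): a minimal text -> spectrogram -> waveform
# sketch with the standalone vocoder, using the same checkpoints as the integration tests
# above; the vocoder call mirrors FastSpeech2ConformerHifiGan's forward(spectrogram).
import torch
from transformers import (
    FastSpeech2ConformerHifiGan,
    FastSpeech2ConformerModel,
    FastSpeech2ConformerTokenizer,
)
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan")
inputs = tokenizer("Test that this generates speech", return_tensors="pt")
with torch.no_grad():
    spectrogram = model(inputs["input_ids"]).spectrogram
    waveform = vocoder(spectrogram)  # (batch_size, num_samples) float waveform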
class FastSpeech2ConformerWithHifiGanTester:
def __init__(
self,
parent,
batch_size=13,
num_hidden_layers=1,
num_attention_heads=2,
hidden_size=24,
seq_length=7,
encoder_linear_units=384,
decoder_linear_units=384,
is_training=False,
speech_decoder_postnet_units=128,
speech_decoder_postnet_layers=2,
pitch_predictor_layers=1,
energy_predictor_layers=1,
duration_predictor_layers=1,
num_mel_bins=8,
upsample_initial_channel=64,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.vocab_size = hidden_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.encoder_linear_units = encoder_linear_units
self.decoder_linear_units = decoder_linear_units
self.speech_decoder_postnet_units = speech_decoder_postnet_units
self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
self.pitch_predictor_layers = pitch_predictor_layers
self.energy_predictor_layers = energy_predictor_layers
self.duration_predictor_layers = duration_predictor_layers
self.num_mel_bins = num_mel_bins
self.upsample_initial_channel = upsample_initial_channel
def prepare_config_and_inputs(self):
config = self.get_config()
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
return config, input_ids
def get_config(self):
self.model_config = FastSpeech2ConformerConfig(
hidden_size=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_linear_units=self.encoder_linear_units,
decoder_linear_units=self.decoder_linear_units,
speech_decoder_postnet_units=self.speech_decoder_postnet_units,
speech_decoder_postnet_layers=self.speech_decoder_postnet_layers,
num_mel_bins=self.num_mel_bins,
pitch_predictor_layers=self.pitch_predictor_layers,
energy_predictor_layers=self.energy_predictor_layers,
duration_predictor_layers=self.duration_predictor_layers,
)
self.vocoder_config = FastSpeech2ConformerHifiGanConfig(
model_in_dim=self.num_mel_bins, upsample_initial_channel=self.upsample_initial_channel
)
return FastSpeech2ConformerWithHifiGanConfig(
model_config=self.model_config.to_dict(), vocoder_config=self.vocoder_config.to_dict()
)
def create_and_check_model(self, config, input_ids, *args):
model = FastSpeech2ConformerWithHifiGan(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, return_dict=True)
# total of 6 keys in result
self.parent.assertEqual(len(result), 6)
# check batch sizes match
for value in result.values():
self.parent.assertEqual(value.size(0), self.batch_size)
# check duration, pitch, and energy have the appropriate shapes
# duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1)
self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape)
self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape)
# check predicted mel-spectrogram has correct dimension
self.parent.assertEqual(result["spectrogram"].size(2), model.config.model_config.num_mel_bins)
def prepare_config_and_inputs_for_common(self):
config, input_ids = self.prepare_config_and_inputs()
inputs_dict = {"input_ids": input_ids}
return config, inputs_dict
@require_torch
class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (FastSpeech2ConformerWithHifiGan,) if is_torch_available() else ()
test_pruning = False
test_headmasking = False
test_torchscript = False
test_resize_embeddings = False
is_encoder_decoder = True
def setUp(self):
self.model_tester = FastSpeech2ConformerWithHifiGanTester(self)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
msg = f"Parameter {name} of model {model_class} seems not properly initialized"
if "norm" in name:
if "bias" in name:
self.assertEqual(param.data.mean().item(), 0.0, msg=msg)
if "weight" in name:
self.assertEqual(param.data.mean().item(), 1.0, msg=msg)
elif "conv" in name or "embed" in name:
self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
return inputs_dict
def test_duration_energy_pitch_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.model_config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
# duration
self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len])
# energy
self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
# pitch
self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1])
def test_hidden_states_output(self):
def _check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]):
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
self.assertIsInstance(hidden_states, (list, tuple))
expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape
self.assertEqual(expected_batch_size, self.model_tester.batch_size)
# Only test encoder seq_length since decoder seq_length is variable based on inputs
if idx == 0:
self.assertEqual(expected_seq_length, self.model_tester.seq_length)
self.assertEqual(expected_hidden_size, self.model_tester.hidden_size)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict["output_hidden_states"] = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.model_config.output_hidden_states = True
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan)
def test_save_load_strict(self):
config, _ = self.model_tester.prepare_config_and_inputs()
model = FastSpeech2ConformerWithHifiGan(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
_, info = FastSpeech2ConformerWithHifiGan.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], [])
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = FastSpeech2ConformerWithHifiGan(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = [
"input_ids",
"attention_mask",
"spectrogram_labels",
"duration_labels",
"pitch_labels",
"energy_labels",
"speaker_ids",
"lang_ids",
"speaker_embedding",
"return_dict",
"output_attentions",
"output_hidden_states",
]
self.assertListEqual(arg_names, expected_arg_names)
# Override as FastSpeech2Conformer does not output cross attentions
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.model_config.output_hidden_states = True
config.model_config.output_attentions = True
model = FastSpeech2ConformerWithHifiGan(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerWithHifiGan)
outputs = model(**inputs)
output = outputs[0]
encoder_hidden_states = outputs.encoder_hidden_states[0]
encoder_hidden_states.retain_grad()
decoder_hidden_states = outputs.decoder_hidden_states[0]
decoder_hidden_states.retain_grad()
encoder_attentions = outputs.encoder_attentions[0]
encoder_attentions.retain_grad()
decoder_attentions = outputs.decoder_attentions[0]
decoder_attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(decoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
self.assertIsNotNone(decoder_attentions.grad)
def test_attention_outputs(self):
"""
Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable
decoder attention shape, and uniquely outputs energy, pitch, and durations.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.model_config.return_dict = True
seq_len = self.model_tester.seq_length
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.model_config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.model_config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
encoder_attentions = outputs.encoder_attentions
self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(encoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
out_len = len(outputs)
correct_outlen = 8
self.assertEqual(out_len, correct_outlen)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
added_hidden_states = 2
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.encoder_attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
@slow
def test_model_from_pretrained(self):
model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
self.assertIsNotNone(model)
@unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="FastSpeech2Conformer has no input embeddings")
def test_model_common_attributes(self):
pass
@require_torch
@require_g2p_en
@slow
class FastSpeech2ConformerWithHifiGanIntegrationTest(unittest.TestCase):
def test_inference_integration(self):
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
model.to(torch_device)
model.eval()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
text = "Test that this generates speech"
input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"]
output = model(input_ids)
waveform = output.waveform
# waveform is too large (1, 52480), so only check first 100 elements
# fmt: off
expected_waveform = torch.tensor(
[
[-9.6345e-04, 1.3557e-03, 5.7559e-04, 2.4706e-04, 2.2675e-04, 1.2258e-04, 4.7784e-04, 1.0109e-03, -1.9718e-04, 6.3495e-04, 3.2106e-04, 6.3620e-05, 9.1713e-04, -2.5664e-05, 1.9596e-04, 6.0418e-04, 8.1112e-04, 3.6342e-04, -6.3396e-04, -2.0146e-04, -1.1768e-04, 4.3155e-04, 7.5599e-04, -2.2972e-04, -9.5665e-05, 3.3078e-04, 1.3793e-04, -1.4932e-04, -3.9645e-04, 3.6473e-05, -1.7224e-04, -4.5370e-05, -4.8950e-04, -4.3059e-04, 1.0451e-04, -1.0485e-03, -6.0410e-04, 1.6990e-04, -2.1997e-04, -3.8769e-04, -7.6898e-04, -3.2372e-04, -1.9783e-04, 5.2896e-05, -1.0586e-03, -7.8516e-04, 7.6867e-04, -8.5331e-05, -4.8158e-04, -4.5362e-05, -1.0770e-04, 6.6823e-04, 3.0765e-04, 3.3669e-04, 9.5677e-04, 1.0458e-03, 5.8129e-04, 3.3737e-04, 1.0816e-03, 7.0346e-04, 4.2378e-04, 4.3131e-04, 2.8095e-04, 1.2201e-03, 5.6121e-04, -1.1086e-04, 4.9908e-04, 1.5586e-04, 4.2046e-04, -2.8088e-04, -2.2462e-04, -1.5539e-04, -7.0126e-04, -2.8577e-04, -3.3693e-04, -1.2471e-04, -6.9104e-04, -1.2867e-03, -6.2651e-04, -2.5586e-04, -1.3201e-04, -9.4537e-04, -4.8438e-04, 4.1458e-04, 6.4109e-04, 1.0891e-04, -6.3764e-04, 4.5573e-04, 8.2974e-04, 3.2973e-06, -3.8274e-04, -2.0400e-04, 4.9922e-04, 2.1508e-04, -1.1009e-04, -3.9763e-05, 3.0576e-04, 3.1485e-05, -2.7574e-05, 3.3856e-04],
],
device=torch_device,
)
# fmt: on
self.assertTrue(torch.allclose(waveform[0, :100], expected_waveform, atol=1e-4))
self.assertEqual(waveform.shape, (1, 52480))
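# Illustration only (not part of the test suite): the combined model collapses the same
# text -> waveform flow into a single forward call, as exercised by the test above.
import torch
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
inputs = tokenizer("Test that this generates speech", return_tensors="pt")
with torch.no_grad():
    waveform = model(inputs["input_ids"]).waveform  # (1, 52480) for this sentence, per the test above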
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the FastSpeech2Conformer tokenizer."""
import unittest
from transformers.models.fastspeech2_conformer import FastSpeech2ConformerTokenizer
from transformers.testing_utils import require_g2p_en, slow
from ...test_tokenization_common import TokenizerTesterMixin
@require_g2p_en
class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FastSpeech2ConformerTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
tokenizer.save_pretrained(self.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"
output_text = "this is a test"
return input_text, output_text
# Custom `get_clean_sequence` since FastSpeech2ConformerTokenizer can't decode id -> string
def get_clean_sequence(self, tokenizer, with_prefix_space=False, **kwargs): # max_length=20, min_length=5
input_text, output_text = self.get_input_output_texts(tokenizer)
ids = tokenizer.encode(output_text, add_special_tokens=False)
return output_text, ids
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
token = "<unk>"
token_id = 1
self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
self.assertEqual(vocab_keys[0], "<blank>")
self.assertEqual(vocab_keys[1], "<unk>")
self.assertEqual(vocab_keys[-4], "UH0")
self.assertEqual(vocab_keys[-2], "..")
self.assertEqual(vocab_keys[-1], "<sos/eos>")
self.assertEqual(len(vocab_keys), 78)
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 78)
@unittest.skip(
"FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_added_token_are_matched_longest_first(self):
pass
@unittest.skip(
"FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_added_tokens_do_lower_case(self):
pass
@unittest.skip(
"FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_tokenize_special_tokens(self):
pass
def test_full_tokenizer(self):
tokenizer = self.get_tokenizer()
tokens = tokenizer.tokenize("This is a test")
ids = [9, 12, 6, 12, 11, 2, 4, 15, 6, 4, 77]
self.assertListEqual(tokens, ["DH", "IH1", "S", "IH1", "Z", "AH0", "T", "EH1", "S", "T", "<sos/eos>"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), ids)
self.assertListEqual(tokenizer.convert_ids_to_tokens(ids), tokens)
@slow
def test_tokenizer_integration(self):
# Custom test since:
# 1) This tokenizer only decodes to tokens (phonemes cannot be converted to text with complete accuracy)
# 2) Uses a sequence without numbers since espnet has different, custom number conversion.
# This tokenizer can phonemize numbers, but where in espnet "32" is phonemized as "thirty two",
# here "32" is phonemized as "thirty-two" because we haven't implemented the custom number handling.
sequences = [
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, GPT, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over thirty-two pretrained "
"models in one hundred plus languages and deep interoperability between Jax, PyTorch and TensorFlow.",
"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
"conditioning on both left and right context in all layers.",
"The quick brown fox jumps over the lazy dog.",
]
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained(
"espnet/fastspeech2_conformer", revision="07f9c4a2d6bbc69b277d87d2202ad1e35b05e113"
)
actual_encoding = tokenizer(sequences)
# fmt: off
expected_encoding = {
'input_ids': [
[4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 22, 30, 7, 14, 21, 8, 29, 3, 34, 3, 18, 11, 17, 12, 4, 21, 10, 4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 2, 3, 5, 17, 12, 4, 21, 10, 17, 7, 29, 4, 7, 31, 3, 5, 25, 38, 4, 17, 7, 2, 20, 32, 5, 11, 40, 15, 3, 21, 2, 8, 17, 38, 17, 2, 6, 24, 7, 10, 2, 4, 45, 10, 39, 21, 11, 25, 38, 4, 23, 37, 15, 4, 6, 23, 7, 2, 25, 38, 4, 2, 23, 11, 8, 15, 14, 11, 23, 5, 13, 6, 4, 12, 8, 4, 21, 25, 23, 11, 8, 15, 3, 39, 2, 8, 1, 22, 30, 7, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 62, 3, 5, 21, 6, 4, 18, 3, 5, 13, 36, 3, 8, 28, 2, 3, 5, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 40, 45, 3, 21, 31, 35, 2, 3, 15, 8, 36, 16, 12, 9, 34, 20, 21, 43, 38, 5, 29, 4, 28, 17, 7, 29, 4, 7, 31, 3, 5, 14, 24, 5, 2, 8, 11, 13, 3, 16, 19, 3, 26, 19, 3, 5, 7, 2, 5, 17, 8, 19, 6, 8, 18, 36, 37, 16, 2, 40, 2, 11, 2, 3, 5, 5, 27, 17, 49, 3, 4, 21, 2, 17, 21, 25, 12, 8, 2, 4, 29, 25, 13, 4, 16, 27, 3, 40, 18, 10, 6, 23, 17, 12, 4, 21, 10, 2, 3, 5, 4, 15, 3, 6, 21, 8, 46, 22, 33, 77],
[25, 38, 4, 12, 11, 5, 13, 11, 32, 3, 5, 4, 28, 17, 7, 27, 4, 7, 31, 3, 5, 27, 17, 25, 51, 5, 13, 7, 15, 10, 35, 2, 3, 2, 8, 7, 45, 17, 7, 2, 11, 2, 3, 4, 31, 35, 2, 3, 11, 22, 7, 19, 14, 2, 3, 8, 31, 25, 2, 8, 5, 4, 15, 10, 6, 4, 25, 32, 40, 55, 3, 4, 8, 29, 10, 2, 3, 5, 12, 35, 2, 3, 13, 36, 24, 3, 25, 34, 43, 8, 15, 22, 4, 2, 3, 5, 7, 32, 4, 10, 24, 3, 4, 54, 10, 6, 4, 13, 3, 30, 8, 8, 31, 21, 11, 33, 77],
[9, 2, 10, 16, 12, 10, 25, 7, 42, 3, 22, 24, 10, 6, 40, 19, 14, 17, 6, 34, 20, 21, 9, 2, 8, 31, 11, 29, 5, 30, 37, 33, 77]
],
'attention_mask': [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
}
# fmt: on
actual_tokens = [tokenizer.decode(input_ids) for input_ids in expected_encoding["input_ids"]]
expected_tokens = [
[tokenizer.convert_ids_to_tokens(id) for id in sequence] for sequence in expected_encoding["input_ids"]
]
self.assertListEqual(actual_encoding["input_ids"], expected_encoding["input_ids"])
self.assertListEqual(actual_encoding["attention_mask"], expected_encoding["attention_mask"])
self.assertTrue(actual_tokens == expected_tokens)
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_add_tokens_tokenizer(self):
pass
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_add_special_tokens(self):
pass
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_added_token_serializable(self):
pass
@unittest.skip(
reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend"
)
def test_save_and_load_tokenizer(self):
pass
@unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping")
def test_internal_consistency(self):
pass
@unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping")
def test_encode_decode_with_spaces(self):
pass
@unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping")
def test_convert_tokens_to_string_format(self):
pass
@unittest.skip("FastSpeech2Conformer tokenizer does not support pairs.")
def test_maximum_encoding_length_pair_input(self):
pass
@unittest.skip(
"FastSpeech2Conformer tokenizer appends eos_token to each string it's passed, including `is_split_into_words=True`."
)
def test_pretokenized_inputs(self):
pass
@unittest.skip(
reason="g2p_en is slow is with large inputs and max encoding length is not a concern for FastSpeech2Conformer"
)
def test_maximum_encoding_length_single_input(self):
pass
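# Illustration only (not part of the test suite): the tokenizer phonemizes English text with
# g2p_en and, as the tests above note, decoding returns phoneme tokens rather than the
# original text, because the phoneme-to-text mapping is one-to-many.
from transformers import FastSpeech2ConformerTokenizer
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
ids = tokenizer("This is a test")["input_ids"]
print(tokenizer.decode(ids))  # phoneme tokens such as ['DH', 'IH1', 'S', ..., '<sos/eos>']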
@@ -123,6 +123,7 @@ SPECIAL_CASES_TO_ALLOW.update(
"DinatConfig": True,
"DonutSwinConfig": True,
"EfficientFormerConfig": True,
"FastSpeech2ConformerConfig": True,
"FSMTConfig": True, "FSMTConfig": True,
"JukeboxConfig": True, "JukeboxConfig": True,
"LayoutLMv2Config": True, "LayoutLMv2Config": True,
@@ -90,6 +90,8 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"UMT5EncoderModel", # Building part of bigger (tested) model.
"Blip2QFormerModel", # Building part of bigger (tested) model.
"ErnieMForInformationExtraction",
"FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from)
"FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models.
"GraphormerDecoderHead", # Building part of bigger (tested) model. "GraphormerDecoderHead", # Building part of bigger (tested) model.
"JukeboxVQVAE", # Building part of bigger (tested) model. "JukeboxVQVAE", # Building part of bigger (tested) model.
"JukeboxPrior", # Building part of bigger (tested) model. "JukeboxPrior", # Building part of bigger (tested) model.
...@@ -159,6 +161,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ ...@@ -159,6 +161,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"Blip2QFormerModel", "Blip2QFormerModel",
"Blip2VisionModel", "Blip2VisionModel",
"ErnieMForInformationExtraction", "ErnieMForInformationExtraction",
"FastSpeech2ConformerHifiGan",
"FastSpeech2ConformerWithHifiGan",
"GitVisionModel", "GitVisionModel",
"GraphormerModel", "GraphormerModel",
"GraphormerForGraphClassification", "GraphormerForGraphClassification",