Unverified Commit 9f3aa46f authored by Patrick von Platen, committed by GitHub

Add Unispeech & Unispeech-SAT (#13963)



* unispeech

* add copy from

* remove hubert copy from

* finish for today

* add unispeech-sat

* adapt more

* up

* up

* up

* up

* add modeling

* add tests

* up

* up

* finish

* up

* Apply suggestions from code review

* up

* up

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* up

* up
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 9799f4e1
@@ -1331,9 +1331,8 @@ class SEWDModel(SEWDPreTrainedModel):
                 mask_prob=self.config.mask_feature_prob,
                 mask_length=self.config.mask_feature_length,
             )
-            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)[
-                :, None
-            ].expand(-1, sequence_length, -1)
+            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
             hidden_states[mask_feature_indices] = 0

         return hidden_states
@@ -223,9 +223,12 @@ class Speech2TextAttention(nn.Module):
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim ** -0.5
         self.is_decoder = is_decoder
@@ -164,9 +164,12 @@ class Speech2Text2Attention(nn.Module):
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim ** -0.5
         self.is_decoder = is_decoder
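The three hunks above swap `assert` statements for explicit exceptions. A small illustrative sketch of the resulting behaviour (assuming torch and transformers are installed; not part of the diff itself): an invalid head count now raises a `ValueError` with the formatted message instead of a bare `AssertionError`.

# Illustrative only: exercises the new check in Speech2TextAttention from the hunk above.
from transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextAttention

try:
    Speech2TextAttention(embed_dim=768, num_heads=5)  # 768 is not divisible by 5
except ValueError as err:
    print(err)  # embed_dim must be divisible by num_heads (got `embed_dim`: 768 and `num_heads`: 5).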
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
_import_structure = {
"configuration_unispeech": ["UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP", "UniSpeechConfig"],
}
if is_torch_available():
_import_structure["modeling_unispeech"] = [
"UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST",
"UniSpeechForCTC",
"UniSpeechForPreTraining",
"UniSpeechForSequenceClassification",
"UniSpeechModel",
"UniSpeechPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig
if is_torch_available():
from .modeling_unispeech import (
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST,
UniSpeechForCTC,
UniSpeechForPreTraining,
UniSpeechForSequenceClassification,
UniSpeechModel,
UniSpeechPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
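A brief illustrative sketch of what the lazy module above registers (not part of the diff): the configuration can be imported without pulling in the torch-dependent modeling code, which is only resolved on first access when `is_torch_available()` is true.

# Illustrative only; the module path follows the package layout implied by the relative imports above.
from transformers.models.unispeech import UniSpeechConfig

config = UniSpeechConfig()
print(config.model_type)  # "unispeech"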
# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" UniSpeech model configuration """
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/unispeech-base-960h": "https://huggingface.co/facebook/unispeech-base-960h/resolve/main/config.json",
# See all UniSpeech models at https://huggingface.co/models?filter=unispeech
}
class UniSpeechConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.UniSpeechModel`. It is used
to instantiate a UniSpeech model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the UniSpeech
`facebook/unispeech-base-960h <https://huggingface.co/facebook/unispeech-base-960h>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 32):
Vocabulary size of the UniSpeech model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.UniSpeechModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities.
final_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for the final projection layer of :class:`UniSpeechForCTC`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for output of the feature extractor.
feat_extract_activation (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for quantized feature extractor states.
conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
of `conv_stride` defines the number of convolutional layers and has to match the length of `conv_dim`.
conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 2, 2)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
length of `conv_kernel` defines the number of convolutional layers and has to match the length of
`conv_dim`.
conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the 1D convolutional layers have a bias.
num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.
num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
False`` corresponds to applying layer norm after the attention layer.
apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
be masked. Approximately ``mask_feature_prob * hidden_size // mask_feature_length`` feature vectors will be
masked along the feature axis. This is only relevant if ``apply_spec_augment is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
Number of codevector groups for product codevector quantization.
contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
The temperature `kappa` in the contrastive loss.
feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for the output of the feature extractor that's used by the quantizer.
num_negatives (:obj:`int`, `optional`, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the quantized feature vectors.
proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the final projection of both the quantized and the transformer features.
diversity_loss_weight (:obj:`float`, `optional`, defaults to 0.1):
The weight of the codebook diversity loss component.
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.UniSpeechForCTC`.
ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
instance of :class:`~transformers.UniSpeechForCTC`.
use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of :class:`~transformers.UniSpeechForSequenceClassification`.
classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the projection before token mean-pooling for classification.
replace_prob (:obj:`float`, `optional`, defaults to 0.5):
Probability that a transformer feature is replaced by a quantized feature during pretraining.
Example::
>>> from transformers import UniSpeechModel, UniSpeechConfig
>>> # Initializing a UniSpeech facebook/unispeech-base-960h style configuration
>>> configuration = UniSpeechConfig()
>>> # Initializing a model from the facebook/unispeech-base-960h style configuration
>>> model = UniSpeechModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "unispeech"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
feat_quantizer_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
do_stable_layer_norm=False,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_feature_prob=0.0,
mask_feature_length=10,
num_codevectors_per_group=320,
num_codevector_groups=2,
contrastive_logits_temperature=0.1,
num_negatives=100,
codevector_dim=256,
proj_codevector_dim=256,
diversity_loss_weight=0.1,
ctc_loss_reduction="mean",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
num_ctc_classes=80,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
replace_prob=0.5,
**kwargs
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
self.conv_dim = list(conv_dim)
self.conv_stride = list(conv_stride)
self.conv_kernel = list(conv_kernel)
self.conv_bias = conv_bias
self.num_conv_pos_embeddings = num_conv_pos_embeddings
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
self.num_feat_extract_layers = len(self.conv_dim)
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.num_ctc_classes = num_ctc_classes
self.vocab_size = vocab_size
self.do_stable_layer_norm = do_stable_layer_norm
self.use_weighted_layer_sum = use_weighted_layer_sum
self.classifier_proj_size = classifier_proj_size
if (
(len(self.conv_stride) != self.num_feat_extract_layers)
or (len(self.conv_kernel) != self.num_feat_extract_layers)
or (len(self.conv_dim) != self.num_feat_extract_layers)
):
raise ValueError(
"Configuration for convolutional layers is incorrect. "
"It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
)
# fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
# parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group
self.num_codevector_groups = num_codevector_groups
self.contrastive_logits_temperature = contrastive_logits_temperature
self.feat_quantizer_dropout = feat_quantizer_dropout
self.num_negatives = num_negatives
self.codevector_dim = codevector_dim
self.proj_codevector_dim = proj_codevector_dim
self.diversity_loss_weight = diversity_loss_weight
# ctc loss
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
# pretraining loss
self.replace_prob = replace_prob
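An illustrative sketch of the validation and the pretraining-specific argument handled above (not part of the diff): the three convolutional tuples must share one length, and `replace_prob` is stored directly on the config.

from transformers import UniSpeechConfig

# Mismatched lengths trip the ValueError raised above (2 conv_dim entries vs. the 7 default strides/kernels).
try:
    UniSpeechConfig(conv_dim=(512, 512))
except ValueError as err:
    print(err)

# A consistent two-layer feature extractor plus a custom replacement probability for pretraining.
config = UniSpeechConfig(conv_dim=(512, 512), conv_stride=(5, 2), conv_kernel=(10, 3), replace_prob=0.6)
print(config.num_feat_extract_layers, config.replace_prob)  # 2 0.6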
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert UniSpeech checkpoint."""
import argparse
import fairseq
import torch
from transformers import UniSpeechConfig, UniSpeechForPreTraining, logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"quantizer.weight_proj": "quantizer.weight_proj",
"quantizer.vars": "quantizer.codevectors",
"project_q": "project_q",
"final_proj": "project_hid",
"w2v_encoder.proj": "ctc_proj",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"ctc_proj",
"quantizer.weight_proj",
"quantizer.codevectors",
"project_q",
"project_hid",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert (
hf_shape == value.shape
), f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be {value.shape} for {full_name}"
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.unispeech.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "unispeech." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
# TODO: don't match quantizer.weight_proj
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
@torch.no_grad()
def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = UniSpeechConfig.from_pretrained(config_path)
else:
config = UniSpeechConfig()
hf_unispeech = UniSpeechForPreTraining(config)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
model = model[0].eval()
recursively_load_weights(model, hf_unispeech)
hf_unispeech.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
convert_unispeech_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
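An illustrative usage sketch for the converter above (not part of the diff): the function can also be called directly instead of through argparse; the checkpoint and output paths are placeholders, and `config_path=None` falls back to a default `UniSpeechConfig()`. A fairseq installation and a compatible UniSpeech checkpoint are required.

# Equivalent to running the script with --checkpoint_path and --pytorch_dump_folder_path; paths are placeholders.
convert_unispeech_checkpoint(
    checkpoint_path="/path/to/unispeech_fairseq_checkpoint.pt",
    pytorch_dump_folder_path="./unispeech-converted",
    config_path=None,
)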
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
_import_structure = {
"configuration_unispeech_sat": ["UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "UniSpeechSatConfig"],
}
if is_torch_available():
_import_structure["modeling_unispeech_sat"] = [
"UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"UniSpeechSatForCTC",
"UniSpeechSatForPreTraining",
"UniSpeechSatForSequenceClassification",
"UniSpeechSatModel",
"UniSpeechSatPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
if is_torch_available():
from .modeling_unispeech_sat import (
UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST,
UniSpeechSatForCTC,
UniSpeechSatForPreTraining,
UniSpeechSatForSequenceClassification,
UniSpeechSatModel,
UniSpeechSatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" UniSpeechSat model configuration """
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/unispeech_sat-base-960h": "https://huggingface.co/facebook/unispeech_sat-base-960h/resolve/main/config.json",
# See all UniSpeechSat models at https://huggingface.co/models?filter=unispeech_sat
}
class UniSpeechSatConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.UniSpeechSatModel`. It is
used to instantiate a UniSpeechSat model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the UniSpeechSat
`facebook/unispeech_sat-base-960h <https://huggingface.co/facebook/unispeech_sat-base-960h>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 32):
Vocabulary size of the UniSpeechSat model. Defines the number of different tokens that can be represented
by the :obj:`inputs_ids` passed when calling :class:`~transformers.UniSpeechSatModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities.
final_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for the final projection layer of :class:`UniSpeechSatForCTC`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for output of the feature extractor.
feat_extract_activation (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for quantized feature extractor states.
conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
of `conv_stride` defines the number of convolutional layers and has to match the length of `conv_dim`.
conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 2, 2)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
length of `conv_kernel` defines the number of convolutional layers and has to match the length of
`conv_dim`.
conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the 1D convolutional layers have a bias.
num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.
num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
False`` corresponds to applying layer norm after the attention layer.
apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
be masked. Approximately ``mask_feature_prob * hidden_size // mask_feature_length`` feature vectors will be
masked along the feature axis. This is only relevant if ``apply_spec_augment is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
Number of codevector groups for product codevector quantization.
contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
The temperature `kappa` in the contrastive loss.
feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for the output of the feature extractor that's used by the quantizer.
num_negatives (:obj:`int`, `optional`, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the quantized feature vectors.
proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the final projection of both the quantized and the transformer features.
diversity_loss_weight (:obj:`float`, `optional`, defaults to 0.1):
The weight of the codebook diversity loss component.
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.UniSpeechSatForCTC`.
ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
instance of :class:`~transformers.UniSpeechSatForCTC`.
use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of :class:`~transformers.UniSpeechSatForSequenceClassification`.
classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the projection before token mean-pooling for classification.
Example::
>>> from transformers import UniSpeechSatModel, UniSpeechSatConfig
>>> # Initializing a UniSpeechSat facebook/unispeech_sat-base-960h style configuration
>>> configuration = UniSpeechSatConfig()
>>> # Initializing a model from the facebook/unispeech_sat-base-960h style configuration
>>> model = UniSpeechSatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "unispeech-sat"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
feat_quantizer_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
do_stable_layer_norm=False,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_feature_prob=0.0,
mask_feature_length=10,
num_codevectors_per_group=320,
num_codevector_groups=2,
contrastive_logits_temperature=0.1,
num_negatives=100,
codevector_dim=256,
proj_codevector_dim=256,
diversity_loss_weight=0.1,
ctc_loss_reduction="mean",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
num_clusters=504,
**kwargs
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
self.conv_dim = list(conv_dim)
self.conv_stride = list(conv_stride)
self.conv_kernel = list(conv_kernel)
self.conv_bias = conv_bias
self.num_conv_pos_embeddings = num_conv_pos_embeddings
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
self.num_feat_extract_layers = len(self.conv_dim)
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
self.num_clusters = num_clusters
self.do_stable_layer_norm = do_stable_layer_norm
self.use_weighted_layer_sum = use_weighted_layer_sum
self.classifier_proj_size = classifier_proj_size
if (
(len(self.conv_stride) != self.num_feat_extract_layers)
or (len(self.conv_kernel) != self.num_feat_extract_layers)
or (len(self.conv_dim) != self.num_feat_extract_layers)
):
raise ValueError(
"Configuration for convolutional layers is incorrect. "
"It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
)
# fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
# parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group
self.num_codevector_groups = num_codevector_groups
self.contrastive_logits_temperature = contrastive_logits_temperature
self.feat_quantizer_dropout = feat_quantizer_dropout
self.num_negatives = num_negatives
self.codevector_dim = codevector_dim
self.proj_codevector_dim = proj_codevector_dim
self.diversity_loss_weight = diversity_loss_weight
# ctc loss
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
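A short worked example around the defaults stored above (not part of the diff): the feature extractor downsamples the raw waveform by roughly the product of `conv_stride`, i.e. 5 * 2**6 = 320 samples per output frame, or about 50 feature vectors per second of 16 kHz audio.

from math import prod

from transformers import UniSpeechSatConfig

config = UniSpeechSatConfig()
downsample = prod(config.conv_stride)    # 5 * 2**6 = 320 samples per frame (edge effects aside)
print(downsample, 16_000 // downsample)  # 320 50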
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert UniSpeechSat checkpoint."""
import argparse
import fairseq
import torch
from transformers import ( # UniSpeechSatCTCTokenizer,; UniSpeechSatFeatureExtractor,; UniSpeechSatProcessor,
UniSpeechSatConfig,
UniSpeechSatForCTC,
UniSpeechSatForPreTraining,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"encoder.layer_norm_for_extract": "layer_norm_for_extract",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"quantizer.weight_proj": "quantizer.weight_proj",
"quantizer.vars": "quantizer.codevectors",
"project_q": "project_q",
"final_proj": "project_hid",
"w2v_encoder.proj": "lm_head",
"label_embs_concat": "label_embeddings_concat",
"mask_emb": "masked_spec_embed",
"spk_proj": "speaker_proj",
}
TOP_LEVEL_KEYS = [
"lm_head",
"quantizer.weight_proj",
"quantizer.codevectors",
"project_q",
"project_hid",
"label_embeddings_concat",
"speaker_proj",
"layer_norm_for_extract",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert (
hf_shape == value.shape
), f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be {value.shape} for {full_name}"
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.unispeech_sat.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "unispeech_sat." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
if "layer_norm_for_extract" in name and (".".join(name.split(".")[:-1]) != key):
# special case since naming is very similar
continue
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
# TODO: don't match quantizer.weight_proj
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert (
value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape
), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
@torch.no_grad()
def convert_unispeech_sat_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = UniSpeechSatConfig.from_pretrained(config_path)
else:
config = UniSpeechSatConfig()
dict_path = ""
if is_finetuned:
hf_wav2vec = UniSpeechSatForCTC(config)
else:
hf_wav2vec = UniSpeechSatForPreTraining(config)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
model = model[0].eval()
recursively_load_weights(model, hf_wav2vec)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_unispeech_sat_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
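An illustrative sketch of the `is_finetuned` switch handled above (not part of the diff): by default the converter builds a `UniSpeechSatForCTC` model, while `is_finetuned=False`, i.e. the `--not_finetuned` flag, produces a `UniSpeechSatForPreTraining` model. Paths below are placeholders and a fairseq installation is required.

convert_unispeech_sat_checkpoint(
    checkpoint_path="/path/to/unispeech_sat_fairseq_checkpoint.pt",
    pytorch_dump_folder_path="./unispeech-sat-pretraining",
    config_path=None,
    dict_path=None,
    is_finetuned=False,  # same as passing --not_finetuned on the command line
)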
@@ -46,7 +46,7 @@
 _CONFIG_FOR_DOC = "Wav2Vec2Config"
 _CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h"
 _PROCESSOR_FOR_DOC = "Wav2Vec2Processor"
-_SEQ_CLASS_CHECKPOINT = ("superb/wav2vec2-base-superb-ks",)
+_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks"
 _SEQ_CLASS_PROCESSOR_FOR_DOC = "Wav2Vec2FeatureExtractor"

 _HIDDEN_STATES_START_POSITION = 2
@@ -462,9 +462,12 @@ class Wav2Vec2Attention(nn.Module):
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim ** -0.5
         self.is_decoder = is_decoder
@@ -858,9 +861,11 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
         self.num_groups = config.num_codevector_groups
         self.num_vars = config.num_codevectors_per_group

-        assert (
-            config.codevector_dim % self.num_groups == 0
-        ), f"`config.codevector_dim {config.codevector_dim} must be divisible by `config.num_codevector_groups` {self.num_groups} for concatenation"
+        if config.codevector_dim % self.num_groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim {config.codevector_dim} must be divisible "
+                f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
+            )

         # storage for codebook variables (codewords)
         self.codevectors = nn.Parameter(
@@ -871,9 +876,6 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
         # can be decayed for training
         self.temperature = 2

-    def set_temperature(self, temperature: int):
-        self.temperature = temperature
-
     @staticmethod
     def _compute_perplexity(probs, mask=None):
         if mask is not None:
@@ -1118,9 +1120,8 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
                 mask_prob=self.config.mask_feature_prob,
                 mask_length=self.config.mask_feature_length,
             )
-            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)[
-                :, None
-            ].expand(-1, sequence_length, -1)
+            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
             hidden_states[mask_feature_indices] = 0

         return hidden_states
@@ -1200,7 +1201,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         """
         Set the Gumbel softmax temperature to a given value. Only necessary for training
         """
-        return self.quantizer.set_temperature(temperature)
+        self.quantizer.temperature = temperature

     def freeze_feature_extractor(self):
         """
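An illustrative sketch of the quantizer-temperature change in the last two hunks (not part of the diff): the temperature is now set by plain attribute assignment; the enclosing setter is assumed here to be `set_gumbel_temperature`, whose name the hunk does not show.

from transformers import Wav2Vec2Config, Wav2Vec2ForPreTraining

model = Wav2Vec2ForPreTraining(Wav2Vec2Config())
model.set_gumbel_temperature(0.5)   # assumed setter name; typically annealed during pretraining
print(model.quantizer.temperature)  # 0.5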
@@ -3626,6 +3626,86 @@ class TrOCRPreTrainedModel:
        requires_backends(cls, ["torch"])
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST = None
class UniSpeechForCTC:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class UniSpeechForPreTraining:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class UniSpeechForSequenceClassification:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class UniSpeechModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class UniSpeechPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class UniSpeechSatForCTC:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class UniSpeechSatForPreTraining:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class UniSpeechSatForSequenceClassification:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class UniSpeechSatModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class UniSpeechSatPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class VisionEncoderDecoderModel:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])