Unverified commit d2cdefb9 authored by Yoach Lacombe, committed by GitHub

Add new meta w2v2-conformer BERT-like model (#28165)



* first commit

* correct default value for non-causal

* update config and modeling code

* update converting checkpoint

* clean modeling and fix tests

* make style

* add new config parameters to docstring

* fix copied from statements

* Apply suggestions from code review
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* make position_embeddings_type docstrings clearer

* clean converting script

* remove function not used

* clean modeling file

* apply suggestion for test file + add convert script to not_doctested

* modify tests according to review - cleaner logic and more tests

* Apply nit suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* add checker of valid position embeddings type

* instantiate new layer norm layer with the right eps

* fix freeze_feature_encoder since it can be None in some cases

* add test same output in convert script

* restore wav2vec2conformer and add new model

* create processor and FE + clean

* add new model code

* fix convert script and set default config parameters

* correct model id paths

* make style

* make fix-copies and cleaning files

* fix copied from statements

* complete .md and fix copies

* clean convert script argument defaults

* fix config parameters docstrings

* fix config docstring

* add copied from and enrich FE tests

* fix copied from and repo-consistency

* add autotokenizer

* make test input length shorter and change docstring code

* fix docstrings and copied from

* add add_adapter to ASR training example

* make testing of adapters more robust

* adapt to multi adapter layers

* refactor input_values->input_features and remove w2v2-bert feature extractor

* remove pretraining model

* remove deprecated features and useless lines

* add copied from and ignore statements to modeling tests

* remove pretraining model #2

* change import in convert script

* change default in convert script

* update readme and remove useless line

* Update tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* refactor BERT to Bert for consistency

* remove useless ignore copy statement

* add persistent to buffer in rotary

* add eps in LayerNorm init and remove copied from

* add adapter activation parameters and add copied from statements

* Fix copied statements and add unittest.skip reasons

* add copied statement in test_processor

* refactor processor

* make style

* replace numpy random by torch rand

* remove expected output CTC

* improve converting script with processor class

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* remove gumbel class

* remove tests related to previously deleted class

* Update src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* correct typos

* remove unused parameters

* update processor to take both text and audio

* update checkpoints

* update expected output and add ctc expected output

* add label_attention_mask

* replace pt with np in processor tests

* fix typo

* revert to behaviour with labels_attention_mask

---------
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 5d8eb93e
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_wav2vec2_bert": [
"WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Wav2Vec2BertConfig",
],
"processing_wav2vec2_bert": ["Wav2Vec2BertProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_wav2vec2_bert"] = [
"WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"Wav2Vec2BertForAudioFrameClassification",
"Wav2Vec2BertForCTC",
"Wav2Vec2BertForSequenceClassification",
"Wav2Vec2BertForXVector",
"Wav2Vec2BertModel",
"Wav2Vec2BertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_wav2vec2_bert import (
WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
Wav2Vec2BertConfig,
)
from .processing_wav2vec2_bert import Wav2Vec2BertProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_wav2vec2_bert import (
WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
Wav2Vec2BertForAudioFrameClassification,
Wav2Vec2BertForCTC,
Wav2Vec2BertForSequenceClassification,
Wav2Vec2BertForXVector,
Wav2Vec2BertModel,
Wav2Vec2BertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
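As a quick sanity check of the public surface this `__init__.py` exposes, here is a minimal usage sketch (assuming `torch` is installed so the lazy module can resolve the modeling classes):

```python
# Minimal sketch: the lazy module only imports torch-dependent code on first access.
from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel

config = Wav2Vec2BertConfig()        # default architecture hyperparameters
model = Wav2Vec2BertModel(config)    # randomly initialized weights
print(type(model).__name__)          # "Wav2Vec2BertModel"
```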
# coding=utf-8
# Copyright 2024 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Wav2Vec2Bert model configuration"""
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/w2v-bert-2.0": "https://huggingface.co/facebook/w2v-bert-2.0/resolve/main/config.json",
}
class Wav2Vec2BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Wav2Vec2BertModel`]. It is used to
instantiate a Wav2Vec2Bert model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Bert
[facebook/w2v-bert-2.0](https://huggingface.co/facebook/w2v-bert-2.0)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*):
Vocabulary size of the Wav2Vec2Bert model. Defines the number of different tokens that can be
represented by the `input_ids` passed when calling [`Wav2Vec2BertModel`].
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
feature_projection_input_dim (`int`, *optional*, defaults to 160):
Input dimension of this model, i.e. the dimension after processing input audio with [`SeamlessM4TFeatureExtractor`] or [`Wav2Vec2BertProcessor`].
hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
hidden_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the feature projection.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Wav2Vec2BertForCTC`].
layerdrop (`float`, *optional*, defaults to 0.1):
The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
mask_time_length (`int`, *optional*, defaults to 10):
Length of vector span along the time axis.
mask_time_min_masks (`int`, *optional*, defaults to 2):
The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
irrespective of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
mask_time_min_masks`.
mask_feature_prob (`float`, *optional*, defaults to 0.0):
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
True`.
mask_feature_length (`int`, *optional*, defaults to 10):
Length of vector span along the feature axis.
mask_feature_min_masks (`int`, *optional*, defaults to 0):
The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
step, irrespective of `mask_feature_prob`. Only relevant if
`mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
instance of [`Wav2Vec2BertForCTC`].
ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
of [`Wav2Vec2BertForCTC`].
use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of [`Wav2Vec2BertForSequenceClassification`].
classifier_proj_size (`int`, *optional*, defaults to 768):
Dimensionality of the projection before token mean-pooling for classification.
tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
*XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
*XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
xvector_output_dim (`int`, *optional*, defaults to 512):
Dimensionality of the *XVector* embedding vectors.
pad_token_id (`int`, *optional*, defaults to 0): The id of the _padding_ token.
bos_token_id (`int`, *optional*, defaults to 1): The id of the _beginning-of-stream_ token.
eos_token_id (`int`, *optional*, defaults to 2): The id of the _end-of-stream_ token.
add_adapter (`bool`, *optional*, defaults to `False`):
Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very
useful for warm-starting Wav2Vec2Bert for SpeechEncoderDecoder models.
adapter_kernel_size (`int`, *optional*, defaults to 3):
Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
adapter_stride (`int`, *optional*, defaults to 2):
Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
num_adapter_layers (`int`, *optional*, defaults to 1):
Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
True`.
adapter_act (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the adapter layers. If string, `"gelu"`,
`"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
use_intermediate_ffn_before_adapter (`bool`, *optional*, defaults to `False`):
Whether an intermediate feed-forward block should be stacked on top of the Wav2Vec2Bert Encoder and before the adapter network.
Only relevant if `add_adapter is True`.
output_hidden_size (`int`, *optional*):
Dimensionality of the encoder output layer. If not defined, this defaults to *hidden_size*. Only relevant
if `add_adapter is True`.
position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`):
Can be set to:
- `rotary`, for rotary position embeddings.
- `relative`, for relative position embeddings.
- `relative_key`, for relative position embeddings as defined by Shaw in [Self-Attention
with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
If left as `None`, no relative position embeddings are applied.
rotary_embedding_base (`int`, *optional*, defaults to 10000):
If `"rotary"` position embeddings are used, defines the size of the embedding base.
max_source_positions (`int`, *optional*, defaults to 5000):
if `"relative"` position embeddings are used, defines the maximum source input positions.
left_max_position_embeddings (`int`, *optional*, defaults to 64):
If `"relative_key"` (aka Shaw) position embeddings are used, defines the left clipping value for relative positions.
right_max_position_embeddings (`int`, *optional*, defaults to 8):
If `"relative_key"` (aka Shaw) position embeddings are used, defines the right clipping value for relative positions.
conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
Kernel size of convolutional depthwise 1D layer in Conformer blocks.
conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all convolutional layers in Conformer blocks.
Example:
```python
>>> from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel
>>> # Initializing a Wav2Vec2Bert facebook/w2v-bert-2.0 style configuration
>>> configuration = Wav2Vec2BertConfig()
>>> # Initializing a model (with random weights) from the facebook/w2v-bert-2.0 style configuration
>>> model = Wav2Vec2BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "wav2vec2-bert"
def __init__(
self,
vocab_size=None,
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
intermediate_size=4096,
feature_projection_input_dim=160,
hidden_act="swish",
hidden_dropout=0.0,
activation_dropout=0.0,
attention_dropout=0.0,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=768,
tdnn_dim=(512, 512, 512, 512, 1500),
tdnn_kernel=(5, 3, 3, 1, 1),
tdnn_dilation=(1, 2, 3, 1, 1),
xvector_output_dim=512,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
add_adapter=False,
adapter_kernel_size=3,
adapter_stride=2,
num_adapter_layers=1,
adapter_act="relu",
use_intermediate_ffn_before_adapter=False,
output_hidden_size=None,
position_embeddings_type="relative_key",
rotary_embedding_base=10000,
max_source_positions=5000,
left_max_position_embeddings=64,
right_max_position_embeddings=8,
conv_depthwise_kernel_size=31,
conformer_conv_dropout=0.1,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.feature_projection_input_dim = feature_projection_input_dim
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
self.use_weighted_layer_sum = use_weighted_layer_sum
self.max_source_positions = max_source_positions
if position_embeddings_type is not None and position_embeddings_type not in [
"rotary",
"relative",
"relative_key",
]:
raise ValueError(
"""
`position_embeddings_type` is not valid. It must be one of the following values:
`["rotary", "relative", "relative_key"]` or left as `None`.
"""
)
self.position_embeddings_type = position_embeddings_type
self.rotary_embedding_base = rotary_embedding_base
self.left_max_position_embeddings = left_max_position_embeddings
self.right_max_position_embeddings = right_max_position_embeddings
# Conformer-block related
self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
self.conformer_conv_dropout = conformer_conv_dropout
# fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
# adapter
self.add_adapter = add_adapter
self.adapter_kernel_size = adapter_kernel_size
self.adapter_stride = adapter_stride
self.num_adapter_layers = num_adapter_layers
self.adapter_act = adapter_act
self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size
if use_intermediate_ffn_before_adapter and not add_adapter:
raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.")
self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter
# SequenceClassification-specific parameter. Feel free to ignore for other classes.
self.classifier_proj_size = classifier_proj_size
# XVector-specific parameters. Feel free to ignore for other classes.
self.tdnn_dim = list(tdnn_dim)
self.tdnn_kernel = list(tdnn_kernel)
self.tdnn_dilation = list(tdnn_dilation)
self.xvector_output_dim = xvector_output_dim
@property
def inputs_to_logits_ratio(self):
# number of raw audio samples per encoder frame; the optional adapter further
# downsamples the sequence by its stride at every adapter layer
return self.feature_projection_input_dim * 2 * (self.adapter_stride**self.num_adapter_layers if self.add_adapter else 1)
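A short sketch, grounded in the validation performed in `__init__` above, showing how `position_embeddings_type` is checked at construction time (values other than `rotary`, `relative`, `relative_key`, or `None` are rejected):

```python
from transformers import Wav2Vec2BertConfig

# Valid: rotary position embeddings instead of the default Shaw-style relative keys.
config = Wav2Vec2BertConfig(position_embeddings_type="rotary")

# Invalid values raise a ValueError before any model weights are created.
try:
    Wav2Vec2BertConfig(position_embeddings_type="alibi")
except ValueError as err:
    print(err)
```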
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Wav2Vec2Bert BERT checkpoint."""
import argparse
import torch
import torchaudio
from fairseq2.data import Collater
from fairseq2.data.audio import WaveformToFbankConverter
from fairseq2.nn.padding import get_seqs_and_padding_mask
from seamless_communication.models.conformer_shaw import load_conformer_shaw_model
from transformers import (
SeamlessM4TFeatureExtractor,
Wav2Vec2BertConfig,
Wav2Vec2BertModel,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
wav2vec_convert_list = [
("encoder_frontend.model_dim_proj", "feature_projection.projection"),
("encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
("encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
("encoder.inner.layers", "encoder.layers"),
("encoder.inner_layer_norm", "encoder.layer_norm"),
("encoder.adaptor_layers", "adapter.layers"),
("inner_proj", "intermediate_dense"),
("self_attn.output_proj", "self_attn.linear_out"),
("output_proj", "output_dense"),
("self_attn.k_proj", "self_attn.linear_k"),
("self_attn.v_proj", "self_attn.linear_v"),
("self_attn.q_proj", "self_attn.linear_q"),
("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"),
("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
("conv.depthwise_conv", "conv_module.depthwise_conv"),
("conv.layer_norm", "conv_module.depthwise_layer_norm"),
("conv_layer_norm", "conv_module.layer_norm"),
("encoder.proj1", "intermediate_ffn.intermediate_dense"),
("encoder.proj2", "intermediate_ffn.output_dense"),
("encoder.layer_norm", "inner_layer_norm"),
("masker.temporal_mask_embed", "masked_spec_embed"),
]
keys_to_remove = {
"quantizer.entry_proj",
"final_proj",
"final_target_proj",
"quantizer.entries",
"quantizer.num_updates",
}
def param_count(model):
return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
def _convert_model(
original_model,
hf_model,
convert_list,
):
state_dict = original_model.state_dict()
for k, v in list(state_dict.items()):
new_key = k
for old_layer_name, new_layer_name in convert_list:
if old_layer_name in new_key:
new_key = new_key.replace(old_layer_name, new_layer_name)
# handled by hand: a `.layer_norm` that directly follows a numbered layer becomes `final_layer_norm`
if ".layer_norm" in new_key and new_key.split(".layer_norm")[0][-1].isnumeric():
new_key = new_key.replace("layer_norm", "final_layer_norm")
add_key = True
for key in keys_to_remove:
if key in new_key:
state_dict.pop(k)
add_key = False
break
if add_key:
state_dict[new_key] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
extra_keys = {k for k in extra_keys if "num_updates" not in k}  # filter out unnecessary parameters
missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
hf_model.load_state_dict(state_dict, strict=True)
n_params = param_count(hf_model)
logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
hf_model.eval()
del state_dict
return hf_model
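To illustrate the renaming rule implemented in `_convert_model` (ordered substring replacements from `wav2vec_convert_list`; the function additionally renames numbered `.layer_norm` keys to `final_layer_norm` by hand), here is a small illustrative sketch with a made-up fairseq2-style key:

```python
# Illustration only: rename a single made-up key the same way _convert_model does.
key = "encoder.inner.layers.3.conv.depthwise_conv.weight"
for old_layer_name, new_layer_name in wav2vec_convert_list:
    if old_layer_name in key:
        key = key.replace(old_layer_name, new_layer_name)
print(key)  # encoder.layers.3.conv_module.depthwise_conv.weight
```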
@torch.no_grad()
def convert_wav2vec2_bert_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
config_path=None,
repo_id=None,
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = Wav2Vec2BertConfig.from_pretrained(config_path, hidden_act="swish")
else:
config = Wav2Vec2BertConfig(apply_spec_augment=False)
hf_wav2vec = Wav2Vec2BertModel(config)
model = load_conformer_shaw_model(checkpoint_path, dtype=torch.float32)
model.eval()
hf_wav2vec = _convert_model(model, hf_wav2vec, wav2vec_convert_list)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if repo_id:
hf_wav2vec.push_to_hub(repo_id, create_pr=True)
# save feature extractor
fe = SeamlessM4TFeatureExtractor(padding_value=1)
fe._set_processor_class("Wav2Vec2BertProcessor")
fe.save_pretrained(pytorch_dump_folder_path)
if repo_id:
fe.push_to_hub(repo_id, create_pr=True)
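# NOTE: `args` below is the argparse namespace created in the `__main__` block at the bottom of this file,
# so the optional output-equivalence check only runs when the script is executed directly.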
if args.audio_path:
waveform, sample_rate = torchaudio.load(args.audio_path)
waveform = torchaudio.functional.resample(waveform, sample_rate, fe.sampling_rate)
fbank_converter = WaveformToFbankConverter(
num_mel_bins=80,
waveform_scale=2**15,
channel_last=True,
standardize=True,
dtype=torch.float32,
)
collater = Collater(pad_value=1)
decoded_audio = {"waveform": waveform.T, "sample_rate": fe.sampling_rate, "format": -1}
src = collater(fbank_converter(decoded_audio))["fbank"]
seqs, padding_mask = get_seqs_and_padding_mask(src)
with torch.inference_mode():
seqs, padding_mask = model.encoder_frontend(seqs, padding_mask)
original_output, padding_mask = model.encoder(seqs, padding_mask)
hf_wav2vec.eval()
inputs = fe(waveform, return_tensors="pt", padding=True)
with torch.no_grad():
outputs = hf_wav2vec(**inputs)
torch.testing.assert_close(original_output, outputs.last_hidden_state, atol=5e-3, rtol=5e-3)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the output PyTorch model.",
)
parser.add_argument(
"--checkpoint_path", default="conformer_shaw", type=str, help="Path to seamless communication checkpoint"
)
parser.add_argument(
"--config_path",
default=None,
type=str,
help="Path to hf config.json of model to convert",
)
parser.add_argument("--repo_id", default=None, type=str, help="Push to this repo id if precised.")
parser.add_argument(
"--audio_path",
default=None,
type=str,
help="If specified, check that the original model and the converted model produce the same outputs.",
)
args = parser.parse_args()
convert_wav2vec2_bert_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.repo_id
)
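After conversion, the dump folder holds both the model weights and the feature extractor saved above; a brief sketch of reloading them (the local path is a hypothetical `--pytorch_dump_folder_path` value):

```python
from transformers import SeamlessM4TFeatureExtractor, Wav2Vec2BertModel

dump_folder = "./w2v-bert-2.0-converted"  # hypothetical output path of the script
model = Wav2Vec2BertModel.from_pretrained(dump_folder)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(dump_folder)
```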
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Speech processor class for Wav2Vec2-BERT
"""
import warnings
from ...processing_utils import ProcessorMixin
from ..seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
class Wav2Vec2BertProcessor(ProcessorMixin):
r"""
Constructs a Wav2Vec2-BERT processor which wraps a Wav2Vec2-BERT feature extractor and a Wav2Vec2 CTC tokenizer into a single
processor.
[`Wav2Vec2BertProcessor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and [`PreTrainedTokenizer`].
See the docstring of [`~Wav2Vec2BertProcessor.__call__`] and [`~Wav2Vec2BertProcessor.decode`] for more information.
Args:
feature_extractor (`SeamlessM4TFeatureExtractor`):
An instance of [`SeamlessM4TFeatureExtractor`]. The feature extractor is a required input.
tokenizer ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "SeamlessM4TFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
try:
return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
except OSError:
warnings.warn(
f"Loading a tokenizer inside {cls.__name__} from a config that does not"
" include a `tokenizer_class` attribute is deprecated and will be "
"removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
" attribute to either your `config.json` or `tokenizer_config.json` "
"file to suppress this warning: ",
FutureWarning,
)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
def __call__(self, audio=None, text=None, **kwargs):
"""
Main method to prepare one or several sequence(s) and audio(s) for the model. This method forwards the `audio`
and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audio` is not
`None` to pre-process the audio. To prepare the target sequence(s), this method forwards the `text` and `kwargs` arguments to
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
and T the sample length of the audio.
kwargs (*optional*):
Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
tokenizer.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
- **attention_mask** -- List of indices specifying which timestamps should be attended to by the model when `audio` is not `None`.
When only `text` is specified, returns the token attention mask.
- **labels** -- List of token ids to be fed to a model. Returned when both `text` and `audio` are not `None`.
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `audio` is `None`.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
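A short usage sketch of the branching above: audio alone returns feature-extractor outputs, text alone returns tokenizer outputs, and passing both attaches the tokenized text as `labels`. The repo id is hypothetical and stands for any checkpoint that ships this processor configuration:

```python
import numpy as np
from transformers import Wav2Vec2BertProcessor

processor = Wav2Vec2BertProcessor.from_pretrained("your-org/w2v-bert-2.0-ctc")  # hypothetical repo id

speech = np.zeros(16000, dtype=np.float32)  # placeholder: 1 second of 16 kHz audio

audio_only = processor(audio=speech, sampling_rate=16000)   # -> log-mel input_features
text_only = processor(text="a test transcription")          # -> input_ids (+ attention_mask)
both = processor(audio=speech, text="a test transcription", sampling_rate=16000)
print(sorted(both.keys()))  # includes "input_features" and "labels"
```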
def pad(self, input_features=None, labels=None, **kwargs):
"""
If `input_features` is not `None`, this method forwards the `input_features` and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.pad`] to pad the input features.
If `labels` is not `None`, this method forwards the `labels` and `kwargs` arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`] to pad the label(s).
Please refer to the docstring of the above two methods for more information.
"""
if input_features is None and labels is None:
raise ValueError("You need to specify either an `input_features` or `labels` input to pad.")
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
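Mirroring the ASR training examples referenced in the commit history, here is a hedged sketch of a CTC data collator built on `pad`: audio inputs and label ids are padded separately, and tokenizer padding is replaced by -100 so the CTC loss ignores it (the `processor` argument is assumed to be an instantiated `Wav2Vec2BertProcessor`):

```python
import torch

def collate_ctc_batch(processor, features):
    # features: list of dicts with "input_features" (audio features) and "labels" (token ids)
    input_features = [{"input_features": f["input_features"]} for f in features]
    label_features = [{"input_ids": f["labels"]} for f in features]

    batch = processor.pad(input_features=input_features, padding=True, return_tensors="pt")
    labels_batch = processor.pad(labels=label_features, padding=True, return_tensors="pt")

    # mask out padding positions so torch.nn.CTCLoss ignores them
    labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)
    batch["labels"] = labels
    return batch
```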
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
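Putting the pieces together, a hedged end-to-end inference sketch for greedy CTC decoding; the repo id is hypothetical and must point to a `Wav2Vec2BertForCTC` checkpoint fine-tuned with a CTC vocabulary:

```python
import numpy as np
import torch
from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor

repo_id = "your-org/w2v-bert-2.0-ctc"  # hypothetical fine-tuned checkpoint
processor = Wav2Vec2BertProcessor.from_pretrained(repo_id)
model = Wav2Vec2BertForCTC.from_pretrained(repo_id)

speech = np.zeros(16000, dtype=np.float32)  # placeholder: 1 second of 16 kHz audio
inputs = processor(audio=speech, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits            # (batch, frames, vocab_size)
predicted_ids = torch.argmax(logits, dim=-1)   # greedy CTC decoding
print(processor.batch_decode(predicted_ids))
```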
@@ -8758,6 +8758,51 @@ class Wav2Vec2PreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class Wav2Vec2BertForAudioFrameClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Wav2Vec2BertForCTC(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Wav2Vec2BertForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Wav2Vec2BertForXVector(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Wav2Vec2BertModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Wav2Vec2BertPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
This diff is collapsed.
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from transformers.models.seamless_m4t import SeamlessM4TFeatureExtractor
from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
from transformers.models.wav2vec2_bert import Wav2Vec2BertProcessor
from transformers.utils import FEATURE_EXTRACTOR_NAME
from ..wav2vec2.test_feature_extraction_wav2vec2 import floats_list
# Copied from tests.models.wav2vec2.test_processor_wav2vec2.Wav2Vec2ProcessorTest with Wav2Vec2FeatureExtractor->SeamlessM4TFeatureExtractor, Wav2Vec2Processor->Wav2Vec2BertProcessor
class Wav2Vec2BertProcessorTest(unittest.TestCase):
def setUp(self):
vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.add_kwargs_tokens_map = {
"pad_token": "<pad>",
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
}
feature_extractor_map = {
"feature_size": 1,
"padding_value": 0.0,
"sampling_rate": 16000,
"return_attention_mask": False,
"do_normalize": True,
}
self.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n")
def get_tokenizer(self, **kwargs_init):
kwargs = self.add_kwargs_tokens_map.copy()
kwargs.update(kwargs_init)
return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
return SeamlessM4TFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(self.tmpdirname)
processor = Wav2Vec2BertProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, SeamlessM4TFeatureExtractor)
def test_save_load_pretrained_additional_features(self):
processor = Wav2Vec2BertProcessor(
tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()
)
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
processor = Wav2Vec2BertProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, SeamlessM4TFeatureExtractor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
raw_speech = floats_list((3, 1000))
input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
input_processor = processor(raw_speech, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
input_str = "This is a test string"
encoded_processor = processor(text=input_str)
encoded_tok = tokenizer(input_str)
for key in encoded_tok.keys():
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.batch_decode(predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.assertListEqual(
processor.model_input_names,
feature_extractor.model_input_names,
msg="`processor` and `feature_extractor` model input names do not match",
)
@@ -762,6 +762,7 @@ OBJECTS_TO_IGNORE = [
"VitMatteForImageMatting",
"VitsTokenizer",
"VivitModel",
"Wav2Vec2BertForCTC",
"Wav2Vec2CTCTokenizer",
"Wav2Vec2Config",
"Wav2Vec2ConformerConfig",
@@ -878,6 +878,7 @@ src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to
src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py
src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py
src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py
src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py