"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "605f3245dcca34381c35520c35ba0b701ed80d58"
Unverified Commit 5a995735 authored by Patrick von Platen, committed by GitHub

Add Wav2Vec2Conformer (#16812)



* save intermediate

* add wav2vec2 conformer

* add more code

* more

* first test passes

* make all checkpoints work

* update

* up

* more clean ups

* save clean-up

* save clean-up

* save more

* remove bogus

* finalize design conformer

* remove vision

* finish all tests

* more changes

* finish code

* add doc tests

* add slow tests

* fix autoconfig test

* up

* correct docstring

* up

* update

* fix

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update docs/source/en/model_doc/wav2vec2-conformer.mdx

* upload

* save copied from

* correct configs

* fix model outputs

* add to docs

* fix imports

* finish

* finish code

* correct copied from

* correct again

* correct make fix

* improve make fix copies

* save

* correct fix copy from

* correct init structure

* correct

* fix import

* apply suggestions
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
parent f0395cf5
@@ -27,7 +27,7 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
 from ...utils import (
@@ -71,35 +71,6 @@ UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class UniSpeechBaseModelOutput(ModelOutput):
-    """
-    Output type of [`UniSpeechBaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
 class UniSpeechForPreTrainingOutput(ModelOutput):
     """
@@ -1158,7 +1129,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
     @add_code_sample_docstrings(
         processor_class=_PROCESSOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=UniSpeechBaseModelOutput,
+        output_type=Wav2Vec2BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
         modality="audio",
         expected_output=_EXPECTED_OUTPUT_SHAPE,
@@ -1171,7 +1142,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, UniSpeechBaseModelOutput]:
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1203,7 +1174,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
         if not return_dict:
             return (hidden_states, extract_features) + encoder_outputs[1:]
-        return UniSpeechBaseModelOutput(
+        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
......
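Note: the hunks above replace the model-specific `UniSpeechBaseModelOutput` with the shared `Wav2Vec2BaseModelOutput`, now imported from `transformers.modeling_outputs`. A minimal sketch of the shared output's fields, using placeholder tensors rather than real model outputs (the shapes are illustrative only):

```python
import torch
from transformers.modeling_outputs import Wav2Vec2BaseModelOutput

# Placeholder tensors, just to show the fields every affected model now returns.
output = Wav2Vec2BaseModelOutput(
    last_hidden_state=torch.zeros(1, 292, 768),  # (batch_size, sequence_length, hidden_size)
    extract_features=torch.zeros(1, 292, 512),   # (batch_size, sequence_length, conv_dim[-1])
)
print(output.last_hidden_state.shape, output.extract_features.shape)
```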
@@ -81,13 +81,13 @@ class UniSpeechSatConfig(PretrainedConfig):
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature encoder states.
-        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
-        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
-        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
             *conv_dim*.
@@ -159,13 +159,13 @@ class UniSpeechSatConfig(PretrainedConfig):
             instance of [`UniSpeechSatForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
             A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
             module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
-        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
             *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
-        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
             A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
......
@@ -27,7 +27,14 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
 from ...utils import (
@@ -77,35 +84,6 @@ UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class UniSpeechSatBaseModelOutput(ModelOutput):
-    """
-    Output type of [`UniSpeechSatBaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
 class UniSpeechSatForPreTrainingOutput(ModelOutput):
     """
@@ -143,38 +121,6 @@ class UniSpeechSatForPreTrainingOutput(ModelOutput):
     attentions: Optional[Tuple[torch.FloatTensor]] = None
-@dataclass
-class XVectorOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2ForXVector`].
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Classification hidden states before AMSoftmax.
-        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Utterance embeddings used for vector similarity-based retrieval.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    embeddings: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
 def _compute_mask_indices(
     shape: Tuple[int, int],
@@ -1198,7 +1144,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
     @add_code_sample_docstrings(
         processor_class=_PROCESSOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=UniSpeechSatBaseModelOutput,
+        output_type=Wav2Vec2BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
         modality="audio",
         expected_output=_EXPECTED_OUTPUT_SHAPE,
@@ -1211,7 +1157,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, UniSpeechSatBaseModelOutput]:
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1243,7 +1189,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
         if not return_dict:
             return (hidden_states, extract_features) + encoder_outputs[1:]
-        return UniSpeechSatBaseModelOutput(
+        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
......
@@ -78,13 +78,13 @@ class Wav2Vec2Config(PretrainedConfig):
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature encoder states.
-        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
-        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
-        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
@@ -156,13 +156,13 @@ class Wav2Vec2Config(PretrainedConfig):
             instance of [`Wav2Vec2ForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
             A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
             module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
-        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
             *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
-        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
             A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
......
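Note: the docstring updates above reflect that these arguments accept plain lists as well as tuples. A small sketch, passing the documented defaults explicitly as lists (all values taken from the docstrings above):

```python
from transformers import Wav2Vec2Config

config = Wav2Vec2Config(
    conv_dim=[512, 512, 512, 512, 512, 512, 512],
    conv_stride=[5, 2, 2, 2, 2, 2, 2],
    conv_kernel=[10, 3, 3, 3, 3, 3, 3],
    tdnn_dim=[512, 512, 512, 512, 1500],
    tdnn_kernel=[5, 3, 3, 1, 1],
    tdnn_dilation=[1, 2, 3, 1, 1],
)
print(len(config.conv_dim))  # 7 convolutional layers in the feature encoder
```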
@@ -33,6 +33,8 @@ from ...modeling_outputs import (
     MaskedLMOutput,
     SequenceClassifierOutput,
     TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
 )
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
@@ -88,35 +90,6 @@ WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class Wav2Vec2BaseModelOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2BaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
 class Wav2Vec2ForPreTrainingOutput(ModelOutput):
     """
@@ -159,38 +132,6 @@ class Wav2Vec2ForPreTrainingOutput(ModelOutput):
     diversity_loss: Optional[torch.FloatTensor] = None
-@dataclass
-class XVectorOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2ForXVector`].
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Classification hidden states before AMSoftmax.
-        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Utterance embeddings used for vector similarity-based retrieval.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    embeddings: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 def _compute_mask_indices(
     shape: Tuple[int, int],
     mask_prob: float,
@@ -1025,11 +966,8 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
         codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
         # use probs to retrieve codevectors
         codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
-        codevectors = (
-            codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
-            .sum(-2)
-            .view(batch_size, sequence_length, -1)
-        )
+        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
         return codevectors, perplexity
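Note: the quantizer hunk above only splits a chained expression into two statements; the result is identical. A standalone sketch with made-up shapes (the sizes below are illustrative, not the config defaults):

```python
import torch

batch_size, sequence_length, num_groups, num_vars, dim = 2, 4, 2, 8, 16
codevectors_per_group = torch.randn(batch_size * sequence_length, num_groups * num_vars, dim)

# Old chained form.
old = (
    codevectors_per_group.view(batch_size * sequence_length, num_groups, num_vars, -1)
    .sum(-2)
    .view(batch_size, sequence_length, -1)
)

# New two-step form from the diff.
codevectors = codevectors_per_group.view(batch_size * sequence_length, num_groups, num_vars, -1)
new = codevectors.sum(-2).view(batch_size, sequence_length, -1)

assert torch.equal(old, new)  # same tensor either way
```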
@@ -1473,13 +1411,13 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         ```python
         >>> import torch
-        >>> from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForPreTraining
+        >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining
         >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
         >>> from datasets import load_dataset
         >>> import soundfile as sf
-        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/wav2vec2-base")
-        >>> model = Wav2Vec2ForPreTraining.from_pretrained("patrickvonplaten/wav2vec2-base")
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+        >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
......
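Note: the pre-training example above is truncated here. A hedged sketch of how such a snippet can continue (this is not the exact doctest text; it only uses `_compute_mask_indices`, which the example already imports, and standard model methods), picking up from `model` and `input_values` defined above:

```python
import torch
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices

batch_size, raw_sequence_length = input_values.shape
sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()

# Sample SpecAugment-style mask indices and run a forward pass without gradients.
mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.bool)

with torch.no_grad():
    outputs = model(input_values, mask_time_indices=mask_time_indices)
```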
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
    "configuration_wav2vec2_conformer": [
        "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Wav2Vec2ConformerConfig",
    ],
}

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_wav2vec2_conformer"] = [
        "WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Wav2Vec2ConformerForAudioFrameClassification",
        "Wav2Vec2ConformerForCTC",
        "Wav2Vec2ConformerForPreTraining",
        "Wav2Vec2ConformerForSequenceClassification",
        "Wav2Vec2ConformerForXVector",
        "Wav2Vec2ConformerModel",
        "Wav2Vec2ConformerPreTrainedModel",
    ]
if TYPE_CHECKING:
    from .configuration_wav2vec2_conformer import (
        WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Wav2Vec2ConformerConfig,
    )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wav2vec2_conformer import (
            WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            Wav2Vec2ConformerForAudioFrameClassification,
            Wav2Vec2ConformerForCTC,
            Wav2Vec2ConformerForPreTraining,
            Wav2Vec2ConformerForSequenceClassification,
            Wav2Vec2ConformerForXVector,
            Wav2Vec2ConformerModel,
            Wav2Vec2ConformerPreTrainedModel,
        )

else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
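Note: with PyTorch installed, the lazy module above resolves these names on first access. A minimal usage sketch (no checkpoint download, just a randomly initialized model):

```python
from transformers.models.wav2vec2_conformer import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerModel,
)

config = Wav2Vec2ConformerConfig()      # default configuration
model = Wav2Vec2ConformerModel(config)  # randomly initialized, no pretrained weights
print(type(model).__name__)
```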
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Wav2Vec2Conformer checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from transformers import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerForCTC,
    Wav2Vec2ConformerForPreTraining,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.linear_k": "encoder.layers.*.self_attn.linear_k",
    "self_attn.linear_v": "encoder.layers.*.self_attn.linear_v",
    "self_attn.linear_q": "encoder.layers.*.self_attn.linear_q",
    "self_attn.pos_bias_u": "encoder.layers.*.self_attn.pos_bias_u",
    "self_attn.pos_bias_v": "encoder.layers.*.self_attn.pos_bias_v",
    "self_attn.linear_out": "encoder.layers.*.self_attn.linear_out",
    "self_attn.linear_pos": "encoder.layers.*.self_attn.linear_pos",
    "self_attn.rotary_emb": "encoder.embed_positions",
    "self_attn_layer_norm": "encoder.layers.*.self_attn_layer_norm",
    "conv_module.pointwise_conv1": "encoder.layers.*.conv_module.pointwise_conv1",
    "conv_module.pointwise_conv2": "encoder.layers.*.conv_module.pointwise_conv2",
    "conv_module.depthwise_conv": "encoder.layers.*.conv_module.depthwise_conv",
    "conv_module.batch_norm": "encoder.layers.*.conv_module.batch_norm",
    "conv_module.layer_norm": "encoder.layers.*.conv_module.layer_norm",
    "ffn1.w_1": "encoder.layers.*.ffn1.intermediate_dense",
    "ffn1.w_2": "encoder.layers.*.ffn1.output_dense",
    "ffn1.layer_norm": "encoder.layers.*.ffn1_layer_norm",
    "ffn2.w_1": "encoder.layers.*.ffn2.intermediate_dense",
    "ffn2.w_2": "encoder.layers.*.ffn2.output_dense",
    "ffn2.layer_norm": "encoder.layers.*.ffn2_layer_norm",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "lm_head",
    "mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
    "lm_head",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
    for attribute in key.split("."):
        hf_pointer = getattr(hf_pointer, attribute)

    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        hf_shape = hf_pointer.shape

    if hf_shape != value.shape:
        raise ValueError(
            f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
            f" {value.shape} for {full_name}"
        )

    if weight_type == "weight":
        hf_pointer.weight.data = value
    elif weight_type == "weight_g":
        hf_pointer.weight_g.data = value
    elif weight_type == "weight_v":
        hf_pointer.weight_v.data = value
    elif weight_type == "bias":
        hf_pointer.bias.data = value
    elif weight_type == "running_mean":
        hf_pointer.running_mean.data = value
    elif weight_type == "running_var":
        hf_pointer.running_var.data = value
    elif weight_type == "num_batches_tracked":
        hf_pointer.num_batches_tracked.data = value
    elif weight_type == "inv_freq":
        hf_pointer.inv_freq.data = value
    else:
        hf_pointer.data = value

    logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_headless):
    unused_weights = []
    fairseq_dict = fairseq_model.state_dict()

    feature_extractor = hf_model.wav2vec2_conformer.feature_extractor

    for name, value in fairseq_dict.items():
        is_used = False
        if "conv_layers" in name:
            load_conv_layer(
                name,
                value,
                feature_extractor,
                unused_weights,
                hf_model.config.feat_extract_norm == "group",
            )
            is_used = True
        else:
            for key, mapped_key in MAPPING.items():
                mapped_key = "wav2vec2_conformer." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
                if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
                    is_used = True
                    if "*" in mapped_key:
                        layer_index = name.split(key)[0].split(".")[-2]
                        mapped_key = mapped_key.replace("*", layer_index)
                    if "pos_bias_u" in name:
                        weight_type = None
                    elif "pos_bias_v" in name:
                        weight_type = None
                    elif "weight_g" in name:
                        weight_type = "weight_g"
                    elif "weight_v" in name:
                        weight_type = "weight_v"
                    elif "bias" in name:
                        weight_type = "bias"
                    elif "weight" in name:
                        # TODO: don't match quantizer.weight_proj
                        weight_type = "weight"
                    elif "running_mean" in name:
                        weight_type = "running_mean"
                    elif "inv_freq" in name:
                        weight_type = "inv_freq"
                    elif "running_var" in name:
                        weight_type = "running_var"
                    elif "num_batches_tracked" in name:
                        weight_type = "num_batches_tracked"
                    else:
                        weight_type = None
                    set_recursively(hf_model, mapped_key, value, name, weight_type)
                continue
        if not is_used:
            unused_weights.append(name)

    logger.warning(f"Unused weights: {unused_weights}")
# Copied from transformers.models.wav2vec2.convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.load_conv_layer
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    name = full_name.split("conv_layers.")[-1]
    items = name.split(".")
    layer_id = int(items[0])
    type_id = int(items[1])

    if type_id == 0:
        if "bias" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].conv.bias.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
        elif "weight" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].conv.weight.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
        if "bias" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
        elif "weight" in name:
            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
                )
            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
    else:
        unused_weights.append(full_name)
@torch.no_grad()
def convert_wav2vec2_conformer_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = Wav2Vec2ConformerConfig.from_pretrained(config_path, hidden_act="swish")
    else:
        config = Wav2Vec2ConformerConfig()

    if "rope" in checkpoint_path:
        config.position_embeddings_type = "rotary"

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important change bos & pad token id since CTC symbol is <pad> and
            # not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            vocab_dict = target_dict.indices

            # fairseq has the <pad> and <s> switched
            vocab_dict["<pad>"] = 0
            vocab_dict["<s>"] = 1
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(vocab_dict, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            return_attention_mask = True if config.feat_extract_norm == "layer" else False
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = Wav2Vec2ConformerForCTC(config)
    else:
        hf_wav2vec = Wav2Vec2ConformerForPreTraining(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
        )
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, not is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_wav2vec2_conformer_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
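Note: a hedged sketch of driving the converter above programmatically instead of through argparse (equivalent to the `--checkpoint_path`, `--pytorch_dump_folder_path`, `--config_path` and `--dict_path` flags); all paths are placeholders and fairseq must be installed:

```python
# Assumes the converter module above is importable (requires fairseq); paths are placeholders.
convert_wav2vec2_conformer_checkpoint(
    checkpoint_path="/path/to/fairseq_checkpoint.pt",
    pytorch_dump_folder_path="/path/to/output_dir",  # must be an existing directory
    config_path="/path/to/config.json",              # optional; defaults to Wav2Vec2ConformerConfig()
    dict_path="/path/to/dict.ltr.txt",               # needed for fine-tuned (CTC) checkpoints
    is_finetuned=True,
)
```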
@@ -77,13 +77,13 @@ class WavLMConfig(PretrainedConfig):
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature encoder states.
-        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
-        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
-        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
             *conv_dim*.
@@ -146,13 +146,13 @@ class WavLMConfig(PretrainedConfig):
             instance of [`WavLMForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
             A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
             module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
-        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
             *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
-        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
             A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
......
@@ -16,7 +16,6 @@
 import math
 import warnings
-from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import numpy as np
@@ -28,16 +27,17 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
-from ...utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-)
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_wavlm import WavLMConfig
@@ -80,67 +80,6 @@ WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-@dataclass
-class WavLMBaseModelOutput(ModelOutput):
-    """
-    Output type of [`WavLMBaseModelOutput`], with potential hidden states and attentions.
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
-            Sequence of extracted feature vectors of the last convolutional layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    last_hidden_state: torch.FloatTensor = None
-    extract_features: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-@dataclass
-class XVectorOutput(ModelOutput):
-    """
-    Output type of [`Wav2Vec2ForXVector`].
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Classification hidden states before AMSoftmax.
-        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
-            Utterance embeddings used for vector similarity-based retrieval.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-    loss: Optional[torch.FloatTensor] = None
-    logits: torch.FloatTensor = None
-    embeddings: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
 def _compute_mask_indices(
     shape: Tuple[int, int],
@@ -1184,7 +1123,7 @@ WAVLM_INPUTS_DOCSTRING = r"""
     "The bare WavLM Model transformer outputting raw hidden-states without any specific head on top.",
     WAVLM_START_DOCSTRING,
 )
-# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM
+# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM, WavLMBaseModelOutput->Wav2Vec2BaseModelOutput
 class WavLMModel(WavLMPreTrainedModel):
     def __init__(self, config: WavLMConfig):
         super().__init__(config)
@@ -1275,7 +1214,7 @@ class WavLMModel(WavLMPreTrainedModel):
     @add_code_sample_docstrings(
         processor_class=_PROCESSOR_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=WavLMBaseModelOutput,
+        output_type=Wav2Vec2BaseModelOutput,
         config_class=_CONFIG_FOR_DOC,
         modality="audio",
         expected_output=_EXPECTED_OUTPUT_SHAPE,
@@ -1288,7 +1227,7 @@ class WavLMModel(WavLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, WavLMBaseModelOutput]:
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1325,7 +1264,7 @@ class WavLMModel(WavLMPreTrainedModel):
         if not return_dict:
             return (hidden_states, extract_features) + encoder_outputs[1:]
-        return WavLMBaseModelOutput(
+        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
......
@@ -4440,6 +4440,58 @@ class Wav2Vec2PreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
+WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class Wav2Vec2ConformerForAudioFrameClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForCTC(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForPreTraining(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerForXVector(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Wav2Vec2ConformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
 WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
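Note: the dummy objects above keep `transformers` importable when PyTorch is absent; instantiating one of the new classes then fails with an informative error rather than an obscure import failure. A small sketch of that behaviour, assuming an environment without torch:

```python
from transformers import Wav2Vec2ConformerModel  # resolves to the dummy when torch is missing

try:
    Wav2Vec2ConformerModel()
except ImportError as err:
    print(err)  # message explains that this class requires the PyTorch backend
```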
@@ -61,6 +61,7 @@ src/transformers/models/vit/modeling_tf_vit.py
 src/transformers/models/vit_mae/modeling_vit_mae.py
 src/transformers/models/wav2vec2/modeling_wav2vec2.py
 src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
 src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
 src/transformers/models/wavlm/modeling_wavlm.py
 src/transformers/models/yolos/modeling_yolos.py