Unverified Commit a7854f33 authored by moto, committed by GitHub

Add HuBERT model architectures (#1769)

This commit adds the following HuBERT model architectures

 - `base` (pre-training)
 - `large` (pre-training / fine-tuning)
 - `xlarge` (pre-training / fine-tuning)

Since the internal components are the same as `Wav2Vec2Model`, the implementation reuses the existing modules.
With these models, it is possible to
- import the pre-trained models published by `fairseq` and compile them with TorchScript (a rough sketch follows below), and
- fine-tune the imported models for downstream tasks.
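
A rough sketch of that workflow (hypothetical snippet, not part of this commit; the checkpoint file name and local path are assumptions based on fairseq's published releases):

```
import torch
import fairseq
from torchaudio.models.wav2vec2.utils import import_fairseq_model

# Load the published checkpoint with fairseq (file name is an assumption here).
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    ['hubert_base_ls960.pt'])
original = models[0]  # fairseq's HubertModel

# Convert to torchaudio's Wav2Vec2Model and script it.
imported = import_fairseq_model(original).eval()
scripted = torch.jit.script(imported)

waveform = torch.randn(1, 16000)  # one second of dummy 16 kHz audio
features, _ = scripted(waveform)  # encoder output (no aux head in the pre-training architecture)
```

For fine-tuning, `hubert_asr_large` / `hubert_asr_xlarge` build the same encoder with an extra linear layer whose size is given by `num_out`.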
parent ecd068f5
......@@ -57,9 +57,8 @@ Wav2Letter
.. automethod:: forward
Wav2Vec2.0
~~~~~~~~~~
Wav2Vec2.0 / HuBERT
~~~~~~~~~~~~~~~~~~~
Model
-----
......@@ -106,6 +105,31 @@ wav2vec2_asr_large_lv60k
.. autofunction:: wav2vec2_asr_large_lv60k
hubert_base
^^^^^^^^^^^
.. autofunction:: hubert_base
hubert_large
^^^^^^^^^^^^
.. autofunction:: hubert_large
hubert_xlarge
^^^^^^^^^^^^^
.. autofunction:: hubert_xlarge
hubert_asr_large
^^^^^^^^^^^^^^^^
.. autofunction:: hubert_asr_large
hubert_asr_xlarge
^^^^^^^^^^^^^^^^^
.. autofunction:: hubert_asr_xlarge
.. currentmodule:: torchaudio.models.wav2vec2.utils
Utility Functions
......
......@@ -6,6 +6,14 @@
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hsu2021hubert,
title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
year={2021},
eprint={2106.07447},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hannun2014deep,
title={Deep Speech: Scaling up end-to-end speech recognition},
author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
......
#!/usr/bin/env python3
"""Generate the conf JSONs from fairseq pretrained weight file, consumed by unit tests
Note:
The current configuration files were generated on fairseq e47a4c84
Usage:
1. Download pretrained parameters from https://github.com/pytorch/fairseq/tree/main/examples/hubert
2. Run this script and save the resulting JSON configuration in the assets directory.
Example:
```
python generate_hubert_model_config.py \
--model-file hubert_base_ls960.pt \
> hubert_base_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k.pt \
> hubert_large_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k_finetune_ls960.pt \
> hubert_large_ll60k_finetune_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_xtralarge_ll60k.pt \
> hubert_xtralarge_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_xtralarge_ll60k_finetune_ls960.pt \
> hubert_xtralarge_ll60k_finetune_ls960.json
```
"""
import json
import argparse
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'--model-file',
required=True,
help=(
'A pt file from '
'https://github.com/pytorch/fairseq/tree/main/examples/hubert'
)
)
return parser.parse_args()
def _load(model_file):
import fairseq
from omegaconf import OmegaConf
models, cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
model = models[0]
cfg = OmegaConf.to_container(cfg)
return model, cfg
def _main():
args = _parse_args()
model, cfg = _load(args.model_file)
if model.__class__.__name__ == 'HubertModel':
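# Pre-training checkpoint: replace machine-specific dataset paths with placeholders so the dumped config is portable.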
cfg['task']['data'] = '/foo/bar'
cfg['task']['label_dir'] = None
conf = {
'_name': 'hubert',
'model': cfg['model'],
'task': cfg['task'],
'num_classes': model.num_classes,
}
elif model.__class__.__name__ == 'HubertCtc':
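# Fine-tuned (CTC) checkpoint: drop the path to the pre-trained weights, keep only the nested _name/task/model entries needed to rebuild the encoder, and replace dataset paths with placeholders.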
conf = cfg['model']
del conf['w2v_path']
keep = ['_name', 'task', 'model']
for key in list(k for k in conf['w2v_args'] if k not in keep):
del conf['w2v_args'][key]
conf['data'] = '/foo/bar/'
conf['w2v_args']['task']['data'] = '/foo/bar'
conf['w2v_args']['task']['labels'] = []
conf['w2v_args']['task']['label_dir'] = '/foo/bar'
print(json.dumps(conf, indent=4, sort_keys=True))
if __name__ == '__main__':
_main()
......@@ -11,32 +11,32 @@ Example:
```
# Pretrained
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small.pt \
> wav2vec_small.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file libri960_big.pt \
> libri960_big.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_new.pt \
> wav2vec_vox_new.json
# Fine-tuned
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small_960h.pt \
> wav2vec_small_960h.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_big_960h.pt \
> wav2vec_large_960h.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec2_vox_960h_new.pt \
> wav2vec_large_lv60_960h.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_960h_pl.pt \
> wav2vec_large_lv60_self_960h.json
```
......
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.1,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.05,
"encoder_layers": 12,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 256,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": false,
"target_glu": false,
"untie_final_proj": false
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"layer6.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": false,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.1,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.0,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.1,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
......@@ -9,6 +9,11 @@ from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
from torchaudio.models.wav2vec2.utils import (
import_fairseq_model,
......@@ -31,29 +36,51 @@ def _name_func(testcase_func, i, param):
return f'{testcase_func.__name__}_{i}_{param[0][1].__name__}'
# Pretraining (not fine-tuned) models
BASE = _load_config('wav2vec_small')
LARGE = _load_config('libri960_big')
LARGE_LV60K = _load_config('wav2vec_vox_new')
XLSR_53_56K = _load_config('xlsr_53_56k')
# Fine-tuned models
BASE_960H = _load_config('wav2vec_small_960h')
LARGE_960H = _load_config('wav2vec_large_960h')
LARGE_LV60K_960H = _load_config('wav2vec_large_lv60k_960h')
LARGE_LV60K_SELF_960H = _load_config('wav2vec_large_lv60k_self_960h')
# Pretraining models
WAV2VEC2_BASE = _load_config('wav2vec_small')
WAV2VEC2_LARGE = _load_config('libri960_big')
WAV2VEC2_LARGE_LV60K = _load_config('wav2vec_vox_new')
WAV2VEC2_XLSR_53_56K = _load_config('xlsr_53_56k')
HUBERT_BASE = _load_config('hubert_base_ls960')
HUBERT_LARGE_LL60K = _load_config('hubert_large_ll60k')
HUBERT_XLARGE_LL60K = _load_config('hubert_xtralarge_ll60k')
# Finetuning models
WAV2VEC2_BASE_960H = _load_config('wav2vec_small_960h')
WAV2VEC2_LARGE_960H = _load_config('wav2vec_large_960h')
WAV2VEC2_LARGE_LV60K_960H = _load_config('wav2vec_large_lv60k_960h')
WAV2VEC2_LARGE_LV60K_SELF_960H = _load_config('wav2vec_large_lv60k_self_960h')
HUBERT_LARGE = _load_config('hubert_large_ll60k_finetune_ls960')
HUBERT_XLARGE = _load_config('hubert_xtralarge_ll60k_finetune_ls960')
# Config and corresponding factory functions
PRETRAINING_CONFIGS = parameterized.expand([
(BASE, wav2vec2_base),
(LARGE, wav2vec2_large),
(LARGE_LV60K, wav2vec2_large_lv60k),
(XLSR_53_56K, wav2vec2_large_lv60k),
WAV2VEC2_PRETRAINING_CONFIGS = parameterized.expand([
(WAV2VEC2_BASE, wav2vec2_base),
(WAV2VEC2_LARGE, wav2vec2_large),
(WAV2VEC2_LARGE_LV60K, wav2vec2_large_lv60k),
(WAV2VEC2_XLSR_53_56K, wav2vec2_large_lv60k),
], name_func=_name_func)
HUBERT_PRETRAINING_CONFIGS = parameterized.expand([
(HUBERT_BASE, hubert_base),
(HUBERT_LARGE_LL60K, hubert_large),
(HUBERT_XLARGE_LL60K, hubert_xlarge),
], name_func=_name_func)
FINETUNED_CONFIGS = parameterized.expand([
(BASE_960H, wav2vec2_asr_base),
(LARGE_960H, wav2vec2_asr_large),
(LARGE_LV60K_960H, wav2vec2_asr_large_lv60k),
(LARGE_LV60K_SELF_960H, wav2vec2_asr_large_lv60k),
ALL_PRETRAINING_CONFIGS = parameterized.expand([
(WAV2VEC2_BASE, wav2vec2_base),
(WAV2VEC2_LARGE, wav2vec2_large),
(WAV2VEC2_LARGE_LV60K, wav2vec2_large_lv60k),
(WAV2VEC2_XLSR_53_56K, wav2vec2_large_lv60k),
(HUBERT_BASE, hubert_base),
(HUBERT_LARGE_LL60K, hubert_large),
(HUBERT_XLARGE_LL60K, hubert_xlarge),
], name_func=_name_func)
FINETUNING_CONFIGS = parameterized.expand([
(WAV2VEC2_BASE_960H, wav2vec2_asr_base),
(WAV2VEC2_LARGE_960H, wav2vec2_asr_large),
(WAV2VEC2_LARGE_LV60K_960H, wav2vec2_asr_large_lv60k),
(WAV2VEC2_LARGE_LV60K_SELF_960H, wav2vec2_asr_large_lv60k),
(HUBERT_LARGE, hubert_asr_large),
(HUBERT_XLARGE, hubert_asr_xlarge),
], name_func=_name_func)
......@@ -76,6 +103,15 @@ class TestFairseqIntegration(TorchaudioTestCase):
Wav2VecEncoder,
Wav2Vec2CtcConfig,
)
from fairseq.models.hubert.hubert_asr import (
HubertCtcConfig,
HubertEncoder,
)
from fairseq.models.hubert.hubert import (
HubertModel,
HubertConfig,
)
from fairseq.tasks.hubert_pretraining import HubertPretrainingConfig
if config['_name'] == 'wav2vec_ctc':
config = copy.deepcopy(config)
......@@ -83,10 +119,22 @@ class TestFairseqIntegration(TorchaudioTestCase):
return Wav2VecEncoder(Wav2Vec2CtcConfig(**config), num_out)
if config['_name'] == 'wav2vec2':
return Wav2Vec2Model(Wav2Vec2Config(**config))
if config['_name'] == 'hubert_ctc':
config = copy.deepcopy(config)
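# The JSON config stores `w2v_args` as a plain dict; fairseq expects an OmegaConf container, so convert it back.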
config['w2v_args'] = OmegaConf.create(config['w2v_args'])
ctc_cfg = HubertCtcConfig(**config)
return HubertEncoder(ctc_cfg, tgt_dict=range(num_out))
if config['_name'] == 'hubert':
dicts = [list(range(i)) for i in config['num_classes']]
return HubertModel(
HubertConfig(**config['model']),
HubertPretrainingConfig(**config['task']),
dicts,
)
raise ValueError(f'Unexpected configuration: {config["_name"]}')
@PRETRAINING_CONFIGS
def test_import_pretraining_model(self, config, _):
@WAV2VEC2_PRETRAINING_CONFIGS
def test_import_wav2vec2_pretraining_model(self, config, _):
"""Wav2vec2 pretraining models from fairseq can be imported and yield the same results"""
batch_size, num_frames = 3, 1024
atol = 1.1e-05 if sys.platform == "darwin" else 1e-05
......@@ -106,7 +154,22 @@ class TestFairseqIntegration(TorchaudioTestCase):
for i, (ref, _) in enumerate(refs['layer_results']):
self.assertEqual(hyp[i], ref.transpose(0, 1), atol=atol, rtol=1.3e-06)
@PRETRAINING_CONFIGS
@HUBERT_PRETRAINING_CONFIGS
def test_import_hubert_pretraining_model(self, config, _):
"""HuBERT pretraining models from fairseq can be imported and yields the same results"""
batch_size, num_frames = 3, 1024
original = self._get_model(config).eval()
imported = import_fairseq_model(original).eval()
x = torch.randn(batch_size, num_frames)
mask = torch.zeros_like(x)
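# An all-zero padding mask tells the fairseq model that no frame is padded.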
hyp, _ = imported.extract_features(x)
for i in range(len(original.encoder.layers)):
ref, _ = original.extract_features(x, padding_mask=mask, output_layer=i + 1)
self.assertEqual(hyp[i], ref)
@ALL_PRETRAINING_CONFIGS
def test_recreate_pretraining_model(self, config, factory_func):
"""Imported pretraining models can be recreated via a factory function without fairseq."""
batch_size, num_frames = 3, 1024
......@@ -131,8 +194,8 @@ class TestFairseqIntegration(TorchaudioTestCase):
self.assertEqual(ref, hyp)
self.assertEqual(ref_lengths, hyp_lengths)
@FINETUNED_CONFIGS
def test_import_finetuned_model(self, config, _):
@FINETUNING_CONFIGS
def test_import_finetuning_model(self, config, _):
"""Fintuned wav2vec2 models from fairseq can be imported and yields the same results"""
num_out = 28
batch_size, num_frames = 3, 1024
......@@ -154,9 +217,9 @@ class TestFairseqIntegration(TorchaudioTestCase):
for i, l in enumerate(output_lengths):
self.assertEqual(ref[i, :l, ...], hyp[i, :l, ...])
@FINETUNED_CONFIGS
def test_recreate_finetuned_model(self, config, factory_func):
"""Imported finetuned models can be recreated via a factory function without fairseq."""
@FINETUNING_CONFIGS
def test_recreate_finetuning_model(self, config, factory_func):
"""Imported finetuning models can be recreated via a factory function without fairseq."""
num_out = 28
batch_size, num_frames = 3, 1024
......
......@@ -8,6 +8,11 @@ from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
from torchaudio_unittest.common_utils import (
TorchaudioTestCase,
......@@ -26,6 +31,9 @@ pretrain_factory_funcs = parameterized.expand([
(wav2vec2_base, ),
(wav2vec2_large, ),
(wav2vec2_large_lv60k, ),
(hubert_base, ),
(hubert_large, ),
(hubert_xlarge, ),
], name_func=_name_func)
......@@ -33,6 +41,8 @@ finetune_factory_funcs = parameterized.expand([
(wav2vec2_asr_base, ),
(wav2vec2_asr_large, ),
(wav2vec2_asr_large_lv60k, ),
(hubert_asr_large, ),
(hubert_asr_xlarge, ),
], name_func=_name_func)
......
......@@ -11,9 +11,13 @@ from .wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
__all__ = [
'Wav2Letter',
'WaveRNN',
......@@ -27,6 +31,11 @@ __all__ = [
'wav2vec2_base',
'wav2vec2_large',
'wav2vec2_large_lv60k',
'hubert_base',
'hubert_large',
'hubert_xlarge',
'hubert_asr_large',
'hubert_asr_xlarge',
'Tacotron2',
'tacotron2',
]
......@@ -6,6 +6,11 @@ from .model import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
from . import utils
......@@ -17,5 +22,10 @@ __all__ = [
'wav2vec2_base',
'wav2vec2_large',
'wav2vec2_large_lv60k',
'hubert_base',
'hubert_large',
'hubert_xlarge',
'hubert_asr_large',
'hubert_asr_xlarge',
'utils',
]
......@@ -328,3 +328,157 @@ def wav2vec2_asr_large_lv60k(num_out: int) -> Wav2Vec2Model:
encoder_layer_drop=0.1,
aux_num_out=num_out,
)
def hubert_base() -> Wav2Vec2Model:
"""Build HuBERT model with "Base" configuration
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for pretraining.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='group_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=768,
encoder_projection_dropout=0.1,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=12,
encoder_num_heads=12,
encoder_attention_dropout=0.1,
encoder_ff_interm_features=3072,
encoder_ff_interm_dropout=0.0,
encoder_dropout=0.1,
encoder_layer_norm_first=False,
encoder_layer_drop=0.05,
aux_num_out=None,
)
def hubert_large() -> Wav2Vec2Model:
"""Build HuBERT model with "Large" configuration
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for pretraining.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1024,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=24,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=4096,
encoder_ff_interm_dropout=0.0,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.0,
aux_num_out=None,
)
def hubert_asr_large(num_out: int) -> Wav2Vec2Model:
"""Build "Large" HuBERT model with an extra linear module
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for fine-tuning on the ASR task.
Args:
num_out: int
The number of output labels.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1024,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=24,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=4096,
encoder_ff_interm_dropout=0.1,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.1,
aux_num_out=num_out,
)
def hubert_xlarge() -> Wav2Vec2Model:
"""Build HuBERT model with "extra large" configuration
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for pretraining.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1024,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=24,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=4096,
encoder_ff_interm_dropout=0.0,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.0,
aux_num_out=None,
)
def hubert_asr_xlarge(num_out: int) -> Wav2Vec2Model:
"""Build "extra large" HuBERT model with an extra linear module
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for fine-tuning on the ASR task.
Args:
num_out: int
The number of output labels.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1280,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=48,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=5120,
encoder_ff_interm_dropout=0.1,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.1,
aux_num_out=num_out,
)
......@@ -110,6 +110,9 @@ def _map_key(key):
# Only relevant when loading fine-tuned models
if match:
return f"aux.{match.group(1)}"
# HuBERT Extension
if key in ['label_embs_concat']:
return key
raise ValueError(f'Unexpected key: {key_}')
......@@ -127,9 +130,10 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
Args:
original (torch.nn.Module):
An instance of fairseq's Wav2Vec2.0 model class.
Either ``fairseq.models.wav2vec.wav2vec2_asr.Wav2VecEncoder`` or
``fairseq.models.wav2vec.wav2vec2.Wav2Vec2Model``.
An instance of fairseq's Wav2Vec2.0 or HuBERT model.
One of ``fairseq.models.wav2vec.wav2vec2_asr.Wav2VecEncoder``,
``fairseq.models.wav2vec.wav2vec2.Wav2Vec2Model``,
``fairseq.models.hubert.hubert.HubertModel`` or
``fairseq.models.hubert.hubert_asr.HubertEncoder``.
Returns:
Wav2Vec2Model: Imported model.
......@@ -173,21 +177,39 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
"""
class_ = original.__class__.__name__
if class_ == 'Wav2Vec2Model':
return _import_pretrained(original)
return _import_wav2vec2_pretraining(original)
if class_ == 'Wav2VecEncoder':
return _import_finetuned(original)
return _import_wav2vec2_finetuning(original)
if class_ == 'HubertModel':
return _import_hubert_pretraining(original)
if class_ == 'HubertEncoder':
return _import_hubert_finetuning(original)
raise ValueError(
f'Expected an instance of `Wav2Vec2Model`, `Wav2VecEncoder`, `HubertModel` or `HubertEncoder`. Found: {class_}')
def _import_finetuned(original: Module) -> Wav2Vec2Model:
def _import_wav2vec2_finetuning(original: Module) -> Wav2Vec2Model:
config = _parse_config(original.w2v_model)
model = _get_model(**config, aux_num_out=original.proj.out_features)
model.load_state_dict(_convert_state_dict(original.state_dict()))
return model
def _import_pretrained(original: Module) -> Wav2Vec2Model:
def _import_wav2vec2_pretraining(original: Module) -> Wav2Vec2Model:
config = _parse_config(original)
model = _get_model(**config, aux_num_out=None)
model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
return model
def _import_hubert_finetuning(original: Module) -> Wav2Vec2Model:
config = _parse_config(original.w2v_model)
model = _get_model(**config, aux_num_out=original.proj.out_features)
model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
return model
def _import_hubert_pretraining(original: Module) -> Wav2Vec2Model:
config = _parse_config(original)
model = _get_model(**config, aux_num_out=None)
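# strict=False: the pre-training checkpoint contains tensors with no counterpart in Wav2Vec2Model (e.g. label_embs_concat), which are skipped here.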
model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
......