Unverified Commit a7854f33 authored by moto, committed by GitHub

Add HuBERT model architectures (#1769)

This commit adds the following HuBERT model architectures

 - `base` (pre-training)
 - `large` (pre-training / fine-tuning)
 - `xlarge` (pre-training / fine-tuning)

Since the internal components are the same as `Wav2Vec2Model`, the implementation reuses the existing modules.
With these models, it is possible to
- import the pre-trained models published by `fairseq` and compile them with TorchScript (a rough sketch follows below), and
- fine-tune the imported models for downstream tasks.
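
A rough sketch of that workflow (hypothetical snippet, not part of this commit; the checkpoint file name and local path are assumptions based on fairseq's published releases):

```
import torch
import fairseq
from torchaudio.models.wav2vec2.utils import import_fairseq_model

# Load the published checkpoint with fairseq (file name is an assumption here).
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    ['hubert_base_ls960.pt'])
original = models[0]  # fairseq's HubertModel

# Convert to torchaudio's Wav2Vec2Model and script it.
imported = import_fairseq_model(original).eval()
scripted = torch.jit.script(imported)

waveform = torch.randn(1, 16000)  # one second of dummy 16 kHz audio
features, _ = scripted(waveform)  # encoder output (no aux head in the pre-training architecture)
```

For fine-tuning, `hubert_asr_large` / `hubert_asr_xlarge` build the same encoder with an extra linear layer whose size is given by `num_out`.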
parent ecd068f5
......@@ -57,9 +57,8 @@ Wav2Letter
.. automethod:: forward
Wav2Vec2.0
~~~~~~~~~~
Wav2Vec2.0 / HuBERT
~~~~~~~~~~~~~~~~~~~
Model
-----
......@@ -106,6 +105,31 @@ wav2vec2_asr_large_lv60k
.. autofunction:: wav2vec2_asr_large_lv60k
hubert_base
^^^^^^^^^^^
.. autofunction:: hubert_base
hubert_large
^^^^^^^^^^^^
.. autofunction:: hubert_large
hubert_xlarge
^^^^^^^^^^^^^
.. autofunction:: hubert_xlarge
hubert_asr_large
^^^^^^^^^^^^^^^^
.. autofunction:: hubert_asr_large
hubert_asr_xlarge
^^^^^^^^^^^^^^^^^
.. autofunction:: hubert_asr_xlarge
.. currentmodule:: torchaudio.models.wav2vec2.utils
Utility Functions
......
......@@ -6,6 +6,14 @@
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hsu2021hubert,
title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
year={2021},
eprint={2106.07447},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hannun2014deep,
title={Deep Speech: Scaling up end-to-end speech recognition},
author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
......
#!/usr/bin/env python3
"""Generate the conf JSONs from fairseq pretrained weight file, consumed by unit tests
Note:
The current configuration files were generated on fairseq e47a4c84
Usage:
1. Download pretrained parameters from https://github.com/pytorch/fairseq/tree/main/examples/hubert
2. Run this script and save the resulting JSON configuration in the assets directory.
Example:
```
python generate_hubert_model_config.py \
--model-file hubert_base_ls960.pt \
> hubert_base_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k.pt \
> hubert_large_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k_finetune_ls960.pt \
> hubert_large_ll60k_finetune_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_xtralarge_ll60k.pt \
> hubert_xtralarge_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_xtralarge_ll60k_finetune_ls960.pt \
> hubert_xtralarge_ll60k_finetune_ls960.json
```
"""
import json
import argparse
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'--model-file',
required=True,
help=(
'A pt file from '
'https://github.com/pytorch/fairseq/tree/main/examples/hubert'
)
)
return parser.parse_args()
def _load(model_file):
import fairseq
from omegaconf import OmegaConf
models, cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
model = models[0]
cfg = OmegaConf.to_container(cfg)
return model, cfg
def _main():
args = _parse_args()
model, cfg = _load(args.model_file)
if model.__class__.__name__ == 'HubertModel':
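# Pre-training checkpoint: replace machine-specific dataset paths with placeholders so the dumped config is portable.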
cfg['task']['data'] = '/foo/bar'
cfg['task']['label_dir'] = None
conf = {
'_name': 'hubert',
'model': cfg['model'],
'task': cfg['task'],
'num_classes': model.num_classes,
}
elif model.__class__.__name__ == 'HubertCtc':
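# Fine-tuned (CTC) checkpoint: drop the path to the pre-trained weights, keep only the nested _name/task/model entries needed to rebuild the encoder, and replace dataset paths with placeholders.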
conf = cfg['model']
del conf['w2v_path']
keep = ['_name', 'task', 'model']
for key in list(k for k in conf['w2v_args'] if k not in keep):
del conf['w2v_args'][key]
conf['data'] = '/foo/bar/'
conf['w2v_args']['task']['data'] = '/foo/bar'
conf['w2v_args']['task']['labels'] = []
conf['w2v_args']['task']['label_dir'] = '/foo/bar'
print(json.dumps(conf, indent=4, sort_keys=True))
if __name__ == '__main__':
_main()
......@@ -11,32 +11,32 @@ Example:
```
# Pretrained
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small.pt \
> wav2vec_small.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file libri960_big.pt \
> libri960_big.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_new.pt \
> wav2vec_vox_new.json
# Fine-tuned
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small_960h.pt \
> wav2vec_small_960h.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_big_960h.pt \
> wav2vec_large_960h.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec2_vox_960h_new.pt \
> wav2vec_large_lv60_960h.json
python generate_fairseq_model_config.py \
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_960h_pl.pt \
> wav2vec_large_lv60_self_960h.json
```
......
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.1,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.05,
"encoder_layers": 12,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 256,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": false,
"target_glu": false,
"untie_final_proj": false
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"layer6.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": false,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.1,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.0,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.1,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
......@@ -9,6 +9,11 @@ from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
from torchaudio.models.wav2vec2.utils import (
import_fairseq_model,
......@@ -31,29 +36,51 @@ def _name_func(testcase_func, i, param):
return f'{testcase_func.__name__}_{i}_{param[0][1].__name__}'
# Pretraining (not fine-tuned) models
BASE = _load_config('wav2vec_small')
LARGE = _load_config('libri960_big')
LARGE_LV60K = _load_config('wav2vec_vox_new')
XLSR_53_56K = _load_config('xlsr_53_56k')
# Fine-tuned models
BASE_960H = _load_config('wav2vec_small_960h')
LARGE_960H = _load_config('wav2vec_large_960h')
LARGE_LV60K_960H = _load_config('wav2vec_large_lv60k_960h')
LARGE_LV60K_SELF_960H = _load_config('wav2vec_large_lv60k_self_960h')
# Pretraining models
WAV2VEC2_BASE = _load_config('wav2vec_small')
WAV2VEC2_LARGE = _load_config('libri960_big')
WAV2VEC2_LARGE_LV60K = _load_config('wav2vec_vox_new')
WAV2VEC2_XLSR_53_56K = _load_config('xlsr_53_56k')
HUBERT_BASE = _load_config('hubert_base_ls960')
HUBERT_LARGE_LL60K = _load_config('hubert_large_ll60k')
HUBERT_XLARGE_LL60K = _load_config('hubert_xtralarge_ll60k')
# Finetuning models
WAV2VEC2_BASE_960H = _load_config('wav2vec_small_960h')
WAV2VEC2_LARGE_960H = _load_config('wav2vec_large_960h')
WAV2VEC2_LARGE_LV60K_960H = _load_config('wav2vec_large_lv60k_960h')
WAV2VEC2_LARGE_LV60K_SELF_960H = _load_config('wav2vec_large_lv60k_self_960h')
HUBERT_LARGE = _load_config('hubert_large_ll60k_finetune_ls960')
HUBERT_XLARGE = _load_config('hubert_xtralarge_ll60k_finetune_ls960')
# Config and corresponding factory functions
PRETRAINING_CONFIGS = parameterized.expand([
(BASE, wav2vec2_base),
(LARGE, wav2vec2_large),
(LARGE_LV60K, wav2vec2_large_lv60k),
(XLSR_53_56K, wav2vec2_large_lv60k),
WAV2VEC2_PRETRAINING_CONFIGS = parameterized.expand([
(WAV2VEC2_BASE, wav2vec2_base),
(WAV2VEC2_LARGE, wav2vec2_large),
(WAV2VEC2_LARGE_LV60K, wav2vec2_large_lv60k),
(WAV2VEC2_XLSR_53_56K, wav2vec2_large_lv60k),
], name_func=_name_func)
HUBERT_PRETRAINING_CONFIGS = parameterized.expand([
(HUBERT_BASE, hubert_base),
(HUBERT_LARGE_LL60K, hubert_large),
(HUBERT_XLARGE_LL60K, hubert_xlarge),
], name_func=_name_func)
FINETUNED_CONFIGS = parameterized.expand([
(BASE_960H, wav2vec2_asr_base),
(LARGE_960H, wav2vec2_asr_large),
(LARGE_LV60K_960H, wav2vec2_asr_large_lv60k),
(LARGE_LV60K_SELF_960H, wav2vec2_asr_large_lv60k),
ALL_PRETRAINING_CONFIGS = parameterized.expand([
(WAV2VEC2_BASE, wav2vec2_base),
(WAV2VEC2_LARGE, wav2vec2_large),
(WAV2VEC2_LARGE_LV60K, wav2vec2_large_lv60k),
(WAV2VEC2_XLSR_53_56K, wav2vec2_large_lv60k),
(HUBERT_BASE, hubert_base),
(HUBERT_LARGE_LL60K, hubert_large),
(HUBERT_XLARGE_LL60K, hubert_xlarge),
], name_func=_name_func)
FINETUNING_CONFIGS = parameterized.expand([
(WAV2VEC2_BASE_960H, wav2vec2_asr_base),
(WAV2VEC2_LARGE_960H, wav2vec2_asr_large),
(WAV2VEC2_LARGE_LV60K_960H, wav2vec2_asr_large_lv60k),
(WAV2VEC2_LARGE_LV60K_SELF_960H, wav2vec2_asr_large_lv60k),
(HUBERT_LARGE, hubert_asr_large),
(HUBERT_XLARGE, hubert_asr_xlarge),
], name_func=_name_func)
......@@ -76,6 +103,15 @@ class TestFairseqIntegration(TorchaudioTestCase):
Wav2VecEncoder,
Wav2Vec2CtcConfig,
)
from fairseq.models.hubert.hubert_asr import (
HubertCtcConfig,
HubertEncoder,
)
from fairseq.models.hubert.hubert import (
HubertModel,
HubertConfig,
)
from fairseq.tasks.hubert_pretraining import HubertPretrainingConfig
if config['_name'] == 'wav2vec_ctc':
config = copy.deepcopy(config)
......@@ -83,10 +119,22 @@ class TestFairseqIntegration(TorchaudioTestCase):
return Wav2VecEncoder(Wav2Vec2CtcConfig(**config), num_out)
if config['_name'] == 'wav2vec2':
return Wav2Vec2Model(Wav2Vec2Config(**config))
if config['_name'] == 'hubert_ctc':
config = copy.deepcopy(config)
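# The JSON config stores `w2v_args` as a plain dict; fairseq expects an OmegaConf container, so convert it back.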
config['w2v_args'] = OmegaConf.create(config['w2v_args'])
ctc_cfg = HubertCtcConfig(**config)
return HubertEncoder(ctc_cfg, tgt_dict=range(num_out))
if config['_name'] == 'hubert':
dicts = [list(range(i)) for i in config['num_classes']]
return HubertModel(
HubertConfig(**config['model']),
HubertPretrainingConfig(**config['task']),
dicts,
)
raise ValueError(f'Unexpected configuration: {config["_name"]}')
@PRETRAINING_CONFIGS
def test_import_pretraining_model(self, config, _):
@WAV2VEC2_PRETRAINING_CONFIGS
def test_import_wav2vec2_pretraining_model(self, config, _):
"""Wav2vec2 pretraining models from fairseq can be imported and yield the same results"""
batch_size, num_frames = 3, 1024
atol = 1.1e-05 if sys.platform == "darwin" else 1e-05
......@@ -106,7 +154,22 @@ class TestFairseqIntegration(TorchaudioTestCase):
for i, (ref, _) in enumerate(refs['layer_results']):
self.assertEqual(hyp[i], ref.transpose(0, 1), atol=atol, rtol=1.3e-06)
@PRETRAINING_CONFIGS
@HUBERT_PRETRAINING_CONFIGS
def test_import_hubert_pretraining_model(self, config, _):
"""HuBERT pretraining models from fairseq can be imported and yields the same results"""
batch_size, num_frames = 3, 1024
original = self._get_model(config).eval()
imported = import_fairseq_model(original).eval()
x = torch.randn(batch_size, num_frames)
mask = torch.zeros_like(x)
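# An all-zero padding mask tells the fairseq model that no frame is padded.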
hyp, _ = imported.extract_features(x)
for i in range(len(original.encoder.layers)):
ref, _ = original.extract_features(x, padding_mask=mask, output_layer=i + 1)
self.assertEqual(hyp[i], ref)
@ALL_PRETRAINING_CONFIGS
def test_recreate_pretraining_model(self, config, factory_func):
"""Imported pretraining models can be recreated via a factory function without fairseq."""
batch_size, num_frames = 3, 1024
......@@ -131,8 +194,8 @@ class TestFairseqIntegration(TorchaudioTestCase):
self.assertEqual(ref, hyp)
self.assertEqual(ref_lengths, hyp_lengths)
@FINETUNED_CONFIGS
def test_import_finetuned_model(self, config, _):
@FINETUNING_CONFIGS
def test_import_finetuning_model(self, config, _):
"""Fintuned wav2vec2 models from fairseq can be imported and yields the same results"""
num_out = 28
batch_size, num_frames = 3, 1024
......@@ -154,9 +217,9 @@ class TestFairseqIntegration(TorchaudioTestCase):
for i, l in enumerate(output_lengths):
self.assertEqual(ref[i, :l, ...], hyp[i, :l, ...])
@FINETUNED_CONFIGS
def test_recreate_finetuned_model(self, config, factory_func):
"""Imported finetuned models can be recreated via a factory function without fairseq."""
@FINETUNING_CONFIGS
def test_recreate_finetuning_model(self, config, factory_func):
"""Imported finetuning models can be recreated via a factory function without fairseq."""
num_out = 28
batch_size, num_frames = 3, 1024
......
......@@ -8,6 +8,11 @@ from torchaudio.models.wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
from torchaudio_unittest.common_utils import (
TorchaudioTestCase,
......@@ -26,6 +31,9 @@ pretrain_factory_funcs = parameterized.expand([
(wav2vec2_base, ),
(wav2vec2_large, ),
(wav2vec2_large_lv60k, ),
(hubert_base, ),
(hubert_large, ),
(hubert_xlarge, ),
], name_func=_name_func)
......@@ -33,6 +41,8 @@ finetune_factory_funcs = parameterized.expand([
(wav2vec2_asr_base, ),
(wav2vec2_asr_large, ),
(wav2vec2_asr_large_lv60k, ),
(hubert_asr_large, ),
(hubert_asr_xlarge, ),
], name_func=_name_func)
......
......@@ -11,9 +11,13 @@ from .wav2vec2 import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
__all__ = [
'Wav2Letter',
'WaveRNN',
......@@ -27,6 +31,11 @@ __all__ = [
'wav2vec2_base',
'wav2vec2_large',
'wav2vec2_large_lv60k',
'hubert_base',
'hubert_large',
'hubert_xlarge',
'hubert_asr_large',
'hubert_asr_xlarge',
'Tacotron2',
'tacotron2',
]
......@@ -6,6 +6,11 @@ from .model import (
wav2vec2_base,
wav2vec2_large,
wav2vec2_large_lv60k,
hubert_base,
hubert_large,
hubert_xlarge,
hubert_asr_large,
hubert_asr_xlarge,
)
from . import utils
......@@ -17,5 +22,10 @@ __all__ = [
'wav2vec2_base',
'wav2vec2_large',
'wav2vec2_large_lv60k',
'hubert_base',
'hubert_large',
'hubert_xlarge',
'hubert_asr_large',
'hubert_asr_xlarge',
'utils',
]
......@@ -328,3 +328,157 @@ def wav2vec2_asr_large_lv60k(num_out: int) -> Wav2Vec2Model:
encoder_layer_drop=0.1,
aux_num_out=num_out,
)
def hubert_base() -> Wav2Vec2Model:
"""Build HuBERT model with "Base" configuration
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for pretraining.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='group_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=768,
encoder_projection_dropout=0.1,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=12,
encoder_num_heads=12,
encoder_attention_dropout=0.1,
encoder_ff_interm_features=3072,
encoder_ff_interm_dropout=0.0,
encoder_dropout=0.1,
encoder_layer_norm_first=False,
encoder_layer_drop=0.05,
aux_num_out=None,
)
def hubert_large() -> Wav2Vec2Model:
"""Build HuBERT model with "Large" configuration
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for pretraining.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1024,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=24,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=4096,
encoder_ff_interm_dropout=0.0,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.0,
aux_num_out=None,
)
def hubert_asr_large(num_out: int) -> Wav2Vec2Model:
"""Build "Large" HuBERT model with an extra linear module
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for fine-tuning on the ASR task.
Args:
num_out: int
The number of output labels.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1024,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=24,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=4096,
encoder_ff_interm_dropout=0.1,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.1,
aux_num_out=num_out,
)
def hubert_xlarge() -> Wav2Vec2Model:
"""Build HuBERT model with "extra large" configuration
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for pretraining.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1024,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=24,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=4096,
encoder_ff_interm_dropout=0.0,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.0,
aux_num_out=None,
)
def hubert_asr_xlarge(num_out: int) -> Wav2Vec2Model:
"""Build "extra large" HuBERT model with an extra linear module
This is one of the model architectures used in *HuBERT*
[:footcite:`hsu2021hubert`] for fine-tuning on the ASR task.
Args:
num_out: int
The number of output labels.
Returns:
Wav2Vec2Model: The resulting model.
"""
return _get_model(
extractor_mode='layer_norm',
extractor_conv_layer_config=None,
extractor_conv_bias=False,
encoder_embed_dim=1280,
encoder_projection_dropout=0.0,
encoder_pos_conv_kernel=128,
encoder_pos_conv_groups=16,
encoder_num_layers=48,
encoder_num_heads=16,
encoder_attention_dropout=0.0,
encoder_ff_interm_features=5120,
encoder_ff_interm_dropout=0.1,
encoder_dropout=0.0,
encoder_layer_norm_first=True,
encoder_layer_drop=0.1,
aux_num_out=num_out,
)
......@@ -110,6 +110,9 @@ def _map_key(key):
# Only relevant when loading fine-tuned models
if match:
return f"aux.{match.group(1)}"
# HuBERT Extension
if key in ['label_embs_concat']:
return key
raise ValueError(f'Unexpected key: {key_}')
......@@ -127,9 +130,10 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
Args:
original (torch.nn.Module):
An instance of fairseq's Wav2Vec2.0 model class.
Either ``fairseq.models.wav2vec.wav2vec2_asr.Wav2VecEncoder`` or
``fairseq.models.wav2vec.wav2vec2.Wav2Vec2Model``.
An instance of fairseq's Wav2Vec2.0 or HuBERT model.
One of ``fairseq.models.wav2vec.wav2vec2_asr.Wav2VecEncoder``,
``fairseq.models.wav2vec.wav2vec2.Wav2Vec2Model``,
``fairseq.models.hubert.hubert.HubertModel`` or
``fairseq.models.hubert.hubert_asr.HubertEncoder``.
Returns:
Wav2Vec2Model: Imported model.
......@@ -173,21 +177,39 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
"""
class_ = original.__class__.__name__
if class_ == 'Wav2Vec2Model':
return _import_pretrained(original)
return _import_wav2vec2_pretraining(original)
if class_ == 'Wav2VecEncoder':
return _import_finetuned(original)
return _import_wav2vec2_finetuning(original)
if class_ == 'HubertModel':
return _import_hubert_pretraining(original)
if class_ == 'HubertEncoder':
return _import_hubert_finetuning(original)
raise ValueError(
f'Expected an instance of `Wav2Vec2Model`, `Wav2VecEncoder`, `HubertModel` or `HubertEncoder`. Found: {class_}')
def _import_finetuned(original: Module) -> Wav2Vec2Model:
def _import_wav2vec2_finetuning(original: Module) -> Wav2Vec2Model:
config = _parse_config(original.w2v_model)
model = _get_model(**config, aux_num_out=original.proj.out_features)
model.load_state_dict(_convert_state_dict(original.state_dict()))
return model
def _import_pretrained(original: Module) -> Wav2Vec2Model:
def _import_wav2vec2_pretraining(original: Module) -> Wav2Vec2Model:
config = _parse_config(original)
model = _get_model(**config, aux_num_out=None)
model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
return model
def _import_hubert_finetuning(original: Module) -> Wav2Vec2Model:
config = _parse_config(original.w2v_model)
model = _get_model(**config, aux_num_out=original.proj.out_features)
model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
return model
def _import_hubert_pretraining(original: Module) -> Wav2Vec2Model:
config = _parse_config(original)
model = _get_model(**config, aux_num_out=None)
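# strict=False: the pre-training checkpoint contains tensors with no counterpart in Wav2Vec2Model (e.g. label_embs_concat), which are skipped here.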
model.load_state_dict(_convert_state_dict(original.state_dict()), strict=False)
......