UPDATE

ffeba11a · mayp777 · 29deb085 · ffeba11a · ffeba11a · ffeba11a
Commit ffeba11a authored Sep 02, 2024 by mayp777
20 changed files
--- a/test/integration_tests/prototype/vggish_pipeline_test.py
+++ b/test/integration_tests/prototype/vggish_pipeline_test.py
+import torchaudio
+from torchaudio.prototype.pipelines import VGGISH
+
+
+def test_vggish():  # Integration check: the VGGISH input processor and model yield the expected output shapes.
+    input_sr = VGGISH.sample_rate
+    input_proc = VGGISH.get_input_processor()
+    model = VGGISH.get_model()
+    path = torchaudio.utils.download_asset("test-assets/Chopin_Ballade_-1_In_G_Minor,_Op._23_excerpt.mp3")
+    waveform, sr = torchaudio.load(path, backend="ffmpeg")
+    waveform = waveform.mean(axis=0)  # average over axis 0 (presumably the channel axis, i.e. downmix to mono; confirm)
+    waveform = torchaudio.functional.resample(waveform, sr, input_sr)  # resample to the bundle's sample rate
+    batch = input_proc(waveform)
+    assert batch.shape == (62, 1, 96, 64)
+    output = model(batch)
+    assert output.shape == (62, 128)
--- a/test/integration_tests/rnnt_pipeline_test.py
+++ b/test/integration_tests/rnnt_pipeline_test.py
 import pytest
 import torchaudio
 from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
+from torchaudio.prototype.pipelines import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3


 @pytest.mark.parametrize(
    "bundle,lang,expected",
    [
        (EMFORMER_RNNT_BASE_LIBRISPEECH, "en", "i have that curiosity beside me at this moment"),
+        (EMFORMER_RNNT_BASE_MUSTC, "en", "I had that curiosity beside me at this moment."),
+        (EMFORMER_RNNT_BASE_TEDLIUM3, "en", "i had that curiosity beside me at this moment"),
    ],
 )
 def test_rnnt(bundle, sample_speech, expected):

--- a/test/integration_tests/squim_pipeline_test.py
+++ b/test/integration_tests/squim_pipeline_test.py
+import pytest
+import torchaudio
+from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
+
+
+@pytest.mark.parametrize(
+    "lang,expected",  # NOTE(review): `lang` is not read in the test body; presumably consumed by the `sample_speech` fixture, confirm
+    [
+        ("en", [0.9978380799293518, 4.23893404006958, 24.217193603515625]),
+    ],
+)
+def test_squim_objective_pretrained_weights(lang, expected, sample_speech):
+    """Test that the metric scores estimated by SquimObjective Bundle is identical to the expected result."""
+    bundle = SQUIM_OBJECTIVE
+
+    # Get SquimObjective model
+    model = bundle.get_model()
+    # Load the waveform from the path provided by the `sample_speech` fixture
+    waveform, sample_rate = torchaudio.load(sample_speech)
+    scores = model(waveform)
+    for i in range(3):  # compare each of the three returned scores with its reference value
+        assert abs(scores[i].item() - expected[i]) < 1e-5
+
+
+@pytest.mark.parametrize(
+    "task,expected",  # NOTE(review): `task` is not read in the test body; presumably consumed by the audio fixtures, confirm
+    [
+        ("speech_separation", [3.9257140159606934, 3.9391300678253174]),
+    ],
+)
+def test_squim_subjective_pretrained_weights(task, expected, mixture_source, clean_sources):
+    """Test that the metric scores estimated by SquimSubjective Bundle is identical to the expected result."""
+    bundle = SQUIM_SUBJECTIVE
+
+    # Get SquimSubjective model
+    model = bundle.get_model()
+    # Load input mixture audio
+    waveform, sample_rate = torchaudio.load(mixture_source)
+    for i, source in enumerate(clean_sources):  # expected[i] is the reference score for the i-th clean source
+        # Load clean reference
+        clean_waveform, sample_rate = torchaudio.load(source)
+        score = model(waveform, clean_waveform)  # the model takes the mixture and one clean reference
+        assert abs(score.item() - expected[i]) < 1e-5
--- a/test/integration_tests/wav2vec2_pipeline_test.py
+++ b/test/integration_tests/wav2vec2_pipeline_test.py
+import os
+
 import pytest
 import torchaudio
 from torchaudio.pipelines import (
@@ -24,6 +26,11 @@ from torchaudio.pipelines import (
    WAV2VEC2_LARGE,
    WAV2VEC2_LARGE_LV60K,
    WAV2VEC2_XLSR53,
+    WAV2VEC2_XLSR_1B,
+    WAV2VEC2_XLSR_300M,
+    WAVLM_BASE,
+    WAVLM_BASE_PLUS,
+    WAVLM_LARGE,
 )


@@ -37,6 +44,9 @@ from torchaudio.pipelines import (
        HUBERT_BASE,
        HUBERT_LARGE,
        HUBERT_XLARGE,
+        WAVLM_BASE,
+        WAVLM_BASE_PLUS,
+        WAVLM_LARGE,
    ],
 )
 def test_pretraining_models(bundle):
@@ -44,6 +54,19 @@ def test_pretraining_models(bundle):
    bundle.get_model()


+@pytest.mark.skipif("CI" not in os.environ, reason="Run tests only in CI environment.")
+@pytest.mark.parametrize(
+    "bundle",
+    [
+        WAV2VEC2_XLSR_300M,
+        WAV2VEC2_XLSR_1B,
+    ],
+)
+def test_xlsr_pretraining_models(bundle):
+    """Smoke test of downloading weights for pretraining models"""
+    bundle.get_model()  # only checks that the call completes; the returned model is discarded
+
+
 @pytest.mark.parametrize(
    "bundle,lang,expected",
    [
@@ -53,7 +76,7 @@ def test_pretraining_models(bundle):
        (WAV2VEC2_ASR_LARGE_10M, "en", "I|HAD|THAT|CURIOUSITY|BESIDE|ME|AT|THIS|MOMENT|"),
        (WAV2VEC2_ASR_LARGE_100H, "en", "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"),
        (WAV2VEC2_ASR_LARGE_960H, "en", "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"),
-        (WAV2VEC2_ASR_LARGE_LV60K_10M, "en", "I|HAD|THAT|CURIOUSSITY|BESID|ME|AT|THISS|MOMENT|"),
+        (WAV2VEC2_ASR_LARGE_LV60K_10M, "en", "I|HAD|THAT|CURIOUSITY|BESID|ME|AT|THISS|MOMENT|"),
        (WAV2VEC2_ASR_LARGE_LV60K_100H, "en", "I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"),
        (WAV2VEC2_ASR_LARGE_LV60K_960H, "en", "I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"),
        (HUBERT_ASR_LARGE, "en", "I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|"),

--- a/test/smoke_test/smoke_test.py
+++ b/test/smoke_test/smoke_test.py
+#!/usr/bin/env python3
 """Run smoke tests"""
+import argparse
+import logging

-import torchaudio  # noqa: F401
-import torchaudio.compliance.kaldi  # noqa: F401
-import torchaudio.datasets  # noqa: F401
-import torchaudio.functional  # noqa: F401
-import torchaudio.models  # noqa: F401
-import torchaudio.pipelines  # noqa: F401
-import torchaudio.sox_effects  # noqa: F401
-import torchaudio.transforms  # noqa: F401
-import torchaudio.utils  # noqa: F401
-from torchaudio.io import StreamReader  # noqa: F401
+
+def base_smoke_test():  # Import torchaudio and its core submodules; the check passes if no import raises.
+    import torchaudio  # noqa: F401
+    import torchaudio.compliance.kaldi  # noqa: F401
+    import torchaudio.datasets  # noqa: F401
+    import torchaudio.functional  # noqa: F401
+    import torchaudio.models  # noqa: F401
+    import torchaudio.pipelines  # noqa: F401
+    import torchaudio.sox_effects  # noqa: F401
+    import torchaudio.transforms  # noqa: F401
+    import torchaudio.utils  # noqa: F401
+
+
+def ffmpeg_test():  # Import StreamReader from torchaudio.io; kept separate so --no-ffmpeg can skip it.
+    from torchaudio.io import StreamReader  # noqa: F401
+
+
+def _run_smoke_test(check_ffmpeg):  # Run the base import check, then the ffmpeg check unless check_ffmpeg is falsy.
+    base_smoke_test()
+
+    if not check_ffmpeg:
+        print("Skipping ffmpeg test.")
+    else:
+        ffmpeg_test()
+
+    print("Smoke test passed.")  # reached only if none of the imports above raised
+
+
+def main(args=None) -> None:  # Entry point; `args` is an optional argv-style list (None lets argparse read sys.argv).
+    options = _parse_args(args)
+
+    if options.debug:
+        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
+
+    _chdir()  # change the working directory before any torchaudio import happens
+    _run_smoke_test(options.ffmpeg)
+
+
+def _parse_args(args):  # Parse command-line options; returns a namespace with `ffmpeg` (default True) and `debug`.
+    parser = argparse.ArgumentParser()
+
+    # Warning: --no-ffmpeg disables the ffmpeg import check; use it only when absolutely necessary.
+    parser.add_argument("--no-ffmpeg", dest="ffmpeg", action="store_false")
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging.")
+
+    return parser.parse_args(args)
+
+
+def _chdir():  # Switch the working directory to this script's directory.
+    # The smoke test must not run from the root of the checked-out source tree (presumably so the source package is not imported instead of the installed one; confirm).
+    import os
+    from pathlib import Path
+
+    os.chdir(Path(__file__).parent)
+    assert "torchaudio" not in os.listdir(os.getcwd())  # NOTE(review): this guard is stripped when Python runs with -O
+
+
+if __name__ == "__main__":
+    main()
--- a/test/smoke_test/smoke_test_no_ffmpeg.py
+++ b/test/smoke_test/smoke_test_no_ffmpeg.py
+from smoke_test import main
+
+main(["--no-ffmpeg"])  # run the smoke test with the ffmpeg (StreamReader) import check disabled
--- a/test/tools/make_test_env.sh
+++ b/test/tools/make_test_env.sh
+# Update the system
+sudo apt-get update
+sudo apt-get upgrade -y
+
+# Install required dependencies (liblame-dev replaced by libmp3lame-dev)
+sudo apt-get install -y libmad0 libmad0-dev libid3tag0 libid3tag0-dev libmp3lame-dev libflac-dev libvorbis-dev yasm nasm
+
+# Install Python packages
+pip3 install pytorch-lightning==1.9.3 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install unidecode -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install inflect -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install pytest==7.3.1 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install expecttest -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install parameterized -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install scipy -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install scikit-learn -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install urllib3==1.26.14 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install SentencePiece -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install deep-phonemizer -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install librosa -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+# pip install demucs -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install Pillow -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install flashlight -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install kaldi_io -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install flashlight-text -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install tinytag -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+
+# Set PKG_CONFIG_PATH
+export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+
+# ffmpeg: upgraded from ffmpeg-4.1.11 to ffmpeg-4.4.4, built with --enable-libmp3lame
+wget --no-check-certificate https://www.ffmpeg.org/releases/ffmpeg-4.4.4.tar.gz
+tar xzf ffmpeg-4.4.4.tar.gz
+cd ffmpeg-4.4.4
+./configure --enable-shared --enable-libmp3lame
+make -j32
+sudo make install
+cd -
+
+# opencore-amr-0.1.6
+wget --no-check-certificate https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.6.tar.gz
+tar xzf opencore-amr-0.1.6.tar.gz
+cd opencore-amr-0.1.6
+./configure
+make -j32
+sudo make install
+cd -
+
+# amrnb-11.0.0.0
+#wget http://www.penguin.cz/~utx/ftp/amr/amrnb-11.0.0.0.tar.bz2
+#tar xf amrnb-11.0.0.0.tar.bz2
+#cd amrnb-11.0.0.0
+#./configure
+#make -j32
+#sudo make install
+#cd -
+
+# amrwb-11.0.0.0
+#wget http://www.penguin.cz/~utx/ftp/amr/amrwb-11.0.0.0.tar.bz2
+#tar xf amrwb-11.0.0.0.tar.bz2
+#cd amrwb-11.0.0.0
+#./configure
+#make -j32
+#sudo make install
+#cd -
+
+# libao-1.2.0
+wget http://downloads.xiph.org/releases/ao/libao-1.2.0.tar.gz
+tar xzf libao-1.2.0.tar.gz
+cd libao-1.2.0
+./configure
+make -j32
+sudo make install
+cd -
+
+# libogg-1.3.5
+wget http://downloads.xiph.org/releases/ogg/libogg-1.3.5.tar.xz
+tar xf libogg-1.3.5.tar.xz
+cd libogg-1.3.5
+./configure --disable-static
+make -j32
+sudo make install
+cd -
+
+# opus-1.4
+wget --no-check-certificate http://downloads.xiph.org/releases/opus/opus-1.4.tar.gz
+tar xzf opus-1.4.tar.gz
+cd opus-1.4
+./configure
+make -j32
+sudo make install
+cd -
+
+# opusfile-0.12
+wget http://downloads.xiph.org/releases/opus/opusfile-0.12.tar.gz
+tar xzf opusfile-0.12.tar.gz
+cd opusfile-0.12
+./configure
+make -j32
+sudo make install
+cd -
+
+# libopusenc-0.2.1
+wget --no-check-certificate https://archive.mozilla.org/pub/opus/libopusenc-0.2.1.tar.gz
+tar xzf libopusenc-0.2.1.tar.gz
+cd libopusenc-0.2.1
+./configure
+make -j32
+sudo make install
+cd -
+
+# opus-tools-0.2
+# wget https://archive.mozilla.org/pub/opus/opus-tools-0.2.tar.gz
+# tar xzf opus-tools-0.2.tar.gz
+# cd opus-tools-0.2
+# ./configure
+# make -j32
+# sudo make install
+# cd -
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+
+# wavpack-4.80.0
+wget --no-check-certificate https://www.wavpack.com/wavpack-4.80.0.tar.bz2
+tar -jxf wavpack-4.80.0.tar.bz2
+cd wavpack-4.80.0
+./configure
+make -j32
+sudo make install
+cd -
+
+# sox-14.4.2
+wget --no-check-certificate https://sourceforge.net/projects/sox/files/sox/14.4.2/sox-14.4.2.tar.gz
+tar xzf sox-14.4.2.tar.gz
+cd sox-14.4.2
+./configure
+make -j32
+sudo make install
+cd -
--- a/test/tools/rocky8_make_test_env.sh
+++ b/test/tools/rocky8_make_test_env.sh
+# Update the system
+sudo dnf update --assumeno
+sudo dnf clean all
+sudo dnf makecache
+sudo dnf install epel-release -y
+
+# Install required dependencies
+sudo dnf install -y \
+  libmad \
+  libmad-devel \
+  libid3tag \
+  libid3tag-devel \
+  lame-devel \
+  flac \
+  flac-devel \
+  libvorbis \
+  libvorbis-devel \
+  opus \
+  opus-devel \
+  yasm \
+  nasm
+
+
+# Install Python packages
+pip install numpy==1.24.3 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install pytorch-lightning==1.9.3 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install unidecode -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install inflect -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install pytest==7.3.1 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install expecttest -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install parameterized -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install scipy -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install scikit-learn -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip3 install urllib3==1.26.14 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install SentencePiece -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install deep-phonemizer -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install librosa -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install Pillow -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install flashlight -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install kaldi_io -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install flashlight-text -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+pip install tinytag -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
+
+# Set PKG_CONFIG_PATH
+export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+
+# ffmpeg: upgraded from ffmpeg-4.1.11 to ffmpeg-4.4.4, built with --enable-libmp3lame
+wget --no-check-certificate https://www.ffmpeg.org/releases/ffmpeg-4.4.4.tar.gz
+tar xzf ffmpeg-4.4.4.tar.gz
+cd ffmpeg-4.4.4
+./configure --enable-shared --enable-libmp3lame
+make -j32
+sudo make install
+cd -
+
+# opencore-amr-0.1.6
+wget --no-check-certificate https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.6.tar.gz
+tar xzf opencore-amr-0.1.6.tar.gz
+cd opencore-amr-0.1.6
+./configure
+make -j32
+sudo make install
+cd -
+
+# libao-1.2.0
+wget http://downloads.xiph.org/releases/ao/libao-1.2.0.tar.gz
+tar xzf libao-1.2.0.tar.gz
+cd libao-1.2.0
+./configure
+make -j32
+sudo make install
+cd -
+
+# libogg-1.3.5
+wget http://downloads.xiph.org/releases/ogg/libogg-1.3.5.tar.xz
+tar xf libogg-1.3.5.tar.xz
+cd libogg-1.3.5
+./configure --disable-static
+make -j32
+sudo make install
+cd -
+
+# opus-1.4
+wget --no-check-certificate http://downloads.xiph.org/releases/opus/opus-1.4.tar.gz
+tar xzf opus-1.4.tar.gz
+cd opus-1.4
+./configure
+make -j32
+sudo make install
+cd -
+
+# opusfile-0.12
+wget http://downloads.xiph.org/releases/opus/opusfile-0.12.tar.gz
+tar xzf opusfile-0.12.tar.gz
+cd opusfile-0.12
+./configure
+make -j32
+sudo make install
+cd -
+
+# libopusenc-0.2.1
+wget --no-check-certificate https://archive.mozilla.org/pub/opus/libopusenc-0.2.1.tar.gz
+tar xzf libopusenc-0.2.1.tar.gz
+cd libopusenc-0.2.1
+./configure
+make -j32
+sudo make install
+cd -
+
+# opus-tools-0.2
+wget https://archive.mozilla.org/pub/opus/opus-tools-0.2.tar.gz
+tar xzf opus-tools-0.2.tar.gz
+cd opus-tools-0.2
+./configure
+make -j32
+sudo make install
+cd -
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+
+# wavpack-4.80.0
+wget --no-check-certificate https://www.wavpack.com/wavpack-4.80.0.tar.bz2
+tar -jxf wavpack-4.80.0.tar.bz2
+cd wavpack-4.80.0
+./configure
+make -j32
+sudo make install
+cd -
+
+# sox-14.4.2
+wget --no-check-certificate https://sourceforge.net/projects/sox/files/sox/14.4.2/sox-14.4.2.tar.gz
+tar xzf sox-14.4.2.tar.gz
+cd sox-14.4.2
+./configure
+make -j32
+sudo make install
+cd -
--- a/test/torchaudio_unittest/assets/RATRACE_wave_f_nm_np1_fr_goo_37.avi
+++ b/test/torchaudio_unittest/assets/RATRACE_wave_f_nm_np1_fr_goo_37.avi
--- a/test/torchaudio_unittest/assets/README.md
+++ b/test/torchaudio_unittest/assets/README.md
+* RATRACE_wave_f_nm_np1_fr_goo_37.avi
+  * Source: HMDB-51 dataset ("wave" subset)
+    https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/#Downloads
+  * License: Creative Commons Attribution 4.0 International License.
+  * Note: This file does not have proper PTS values and is thus useful for testing seek on such files.
--- a/test/torchaudio_unittest/assets/nasa_13013.avi
+++ b/test/torchaudio_unittest/assets/nasa_13013.avi
--- a/test/torchaudio_unittest/assets/testsrc.hevc
+++ b/test/torchaudio_unittest/assets/testsrc.hevc
--- a/test/torchaudio_unittest/assets/wav2vec2/fairseq/generate_hubert_model_config.py
+++ b/test/torchaudio_unittest/assets/wav2vec2/fairseq/generate_hubert_model_config.py
@@ -76,8 +76,9 @@ def _main():
        conf = cfg["model"]
        del conf["w2v_path"]
        keep = ["_name", "task", "model"]
-        for key in list(k for k in conf["w2v_args"] if k not in keep):
-            del conf["w2v_args"][key]
+        for key in conf["w2v_args"]:
+            if key not in keep:
+                del conf["w2v_args"][key]
        conf["data"] = "/foo/bar/"
        conf["w2v_args"]["task"]["data"] = "/foo/bar"
        conf["w2v_args"]["task"]["labels"] = []

--- a/test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_1b.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_1b.json
+{
+    "_name": "wav2vec2",
+    "extractor_mode": "layer_norm",
+    "encoder_layers": 48,
+    "encoder_embed_dim": 1280,
+    "encoder_ffn_embed_dim": 5120,
+    "encoder_attention_heads": 16,
+    "activation_fn": "gelu",
+    "dropout": 0.0,
+    "attention_dropout": 0.0,
+    "activation_dropout": 0.0,
+    "encoder_layerdrop": 0.0,
+    "dropout_input": 0.1,
+    "dropout_features": 0.1,
+    "final_dim": 1024,
+    "layer_norm_first": true,
+    "conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+    "conv_bias": true,
+    "logit_temp": 0.1,
+    "quantize_targets": true,
+    "quantize_input": false,
+    "same_quantizer": false,
+    "target_glu": false,
+    "feature_grad_mult": 1.0,
+    "latent_vars": 320,
+    "latent_groups": 2,
+    "latent_dim": 0,
+    "mask_length": 10,
+    "mask_prob": 0.65,
+    "mask_selection": "static",
+    "mask_other": 0.0,
+    "no_mask_overlap": false,
+    "mask_min_space": 1,
+    "mask_channel_length": 10,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_channel_other": 0.0,
+    "no_mask_channel_overlap": false,
+    "mask_channel_min_space": 1,
+    "num_negatives": 100,
+    "negatives_from_everywhere": false,
+    "cross_sample_negatives": 0,
+    "codebook_negatives": 0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16,
+    "latent_temp": [
+        2.0,
+        0.1,
+        0.999995
+    ]
+}
--- a/test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_2b.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_2b.json
+{
+    "_name": "wav2vec2",
+    "extractor_mode": "layer_norm",
+    "encoder_layers": 48,
+    "encoder_embed_dim": 1920,
+    "encoder_ffn_embed_dim": 7680,
+    "encoder_attention_heads": 16,
+    "activation_fn": "gelu",
+    "dropout": 0.0,
+    "attention_dropout": 0.0,
+    "activation_dropout": 0.0,
+    "encoder_layerdrop": 0.0,
+    "dropout_input": 0.1,
+    "dropout_features": 0.1,
+    "final_dim": 1024,
+    "layer_norm_first": true,
+    "conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+    "conv_bias": true,
+    "logit_temp": 0.1,
+    "quantize_targets": true,
+    "quantize_input": false,
+    "same_quantizer": false,
+    "target_glu": false,
+    "feature_grad_mult": 1.0,
+    "latent_vars": 320,
+    "latent_groups": 2,
+    "latent_dim": 0,
+    "mask_length": 10,
+    "mask_prob": 0.65,
+    "mask_selection": "static",
+    "mask_other": 0.0,
+    "no_mask_overlap": false,
+    "mask_min_space": 1,
+    "mask_channel_length": 10,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_channel_other": 0.0,
+    "no_mask_channel_overlap": false,
+    "mask_channel_min_space": 1,
+    "num_negatives": 100,
+    "negatives_from_everywhere": false,
+    "cross_sample_negatives": 0,
+    "codebook_negatives": 0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16,
+    "latent_temp": [
+        2.0,
+        0.1,
+        0.999995
+    ]
+}
--- a/test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_300m.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_300m.json
+{
+    "_name": "wav2vec2",
+    "extractor_mode": "layer_norm",
+    "encoder_layers": 24,
+    "encoder_embed_dim": 1024,
+    "encoder_ffn_embed_dim": 4096,
+    "encoder_attention_heads": 16,
+    "activation_fn": "gelu",
+    "dropout": 0.0,
+    "attention_dropout": 0.0,
+    "activation_dropout": 0.0,
+    "encoder_layerdrop": 0.0,
+    "dropout_input": 0.0,
+    "dropout_features": 0.0,
+    "final_dim": 768,
+    "layer_norm_first": true,
+    "conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+    "conv_bias": true,
+    "logit_temp": 0.1,
+    "quantize_targets": true,
+    "quantize_input": false,
+    "same_quantizer": false,
+    "target_glu": false,
+    "feature_grad_mult": 1.0,
+    "latent_vars": 320,
+    "latent_groups": 2,
+    "latent_dim": 0,
+    "mask_length": 10,
+    "mask_prob": 0.65,
+    "mask_selection": "static",
+    "mask_other": 0.0,
+    "no_mask_overlap": false,
+    "mask_min_space": 1,
+    "mask_channel_length": 10,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_channel_other": 0.0,
+    "no_mask_channel_overlap": false,
+    "mask_channel_min_space": 1,
+    "num_negatives": 100,
+    "negatives_from_everywhere": false,
+    "cross_sample_negatives": 0,
+    "codebook_negatives": 0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16,
+    "latent_temp": [
+        2.0,
+        0.1,
+        0.999995
+    ]
+}
--- a/test/torchaudio_unittest/assets/wav2vec2/huggingface/wav2vec2-xls-r-1b.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/huggingface/wav2vec2-xls-r-1b.json
+{
+    "activation_dropout": 0.0,
+    "apply_spec_augment": true,
+    "architectures": [
+      "Wav2Vec2Model"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "codevector_dim": 1024,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": true,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": true,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_dropout": 0.0,
+    "feat_extract_norm": "layer",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 1280,
+    "initializer_range": 0.02,
+    "intermediate_size": 5120,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "mask_feature_length": 10,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_prob": 0.075,
+    "model_type": "wav2vec2",
+    "num_attention_heads": 16,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 48,
+    "num_negatives": 100,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 1024,
+    "torch_dtype": "float32",
+    "transformers_version": "4.12.0.dev0",
+    "use_weighted_layer_sum": false
+  }
--- a/test/torchaudio_unittest/assets/wav2vec2/huggingface/wav2vec2-xls-r-2b.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/huggingface/wav2vec2-xls-r-2b.json
+{
+    "activation_dropout": 0.0,
+    "apply_spec_augment": true,
+    "architectures": [
+      "Wav2Vec2Model"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "codevector_dim": 1024,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": true,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": true,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_dropout": 0.0,
+    "feat_extract_norm": "layer",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 1920,
+    "initializer_range": 0.02,
+    "intermediate_size": 7680,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "mask_feature_length": 10,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_prob": 0.075,
+    "model_type": "wav2vec2",
+    "num_attention_heads": 16,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 48,
+    "num_negatives": 100,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 1024,
+    "torch_dtype": "float32",
+    "transformers_version": "4.12.0.dev0"
+  }
--- a/test/torchaudio_unittest/assets/wav2vec2/huggingface/wav2vec2-xls-r-300m.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/huggingface/wav2vec2-xls-r-300m.json
+{
+    "activation_dropout": 0.0,
+    "apply_spec_augment": true,
+    "architectures": [
+      "Wav2Vec2Model"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "codevector_dim": 768,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": true,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": true,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_dropout": 0.0,
+    "feat_extract_norm": "layer",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "mask_feature_length": 10,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_prob": 0.075,
+    "model_type": "wav2vec2",
+    "num_attention_heads": 16,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 24,
+    "num_negatives": 100,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 768,
+    "torch_dtype": "float32",
+    "transformers_version": "4.12.0.dev0",
+    "use_weighted_layer_sum": false
+  }
--- a/test/torchaudio_unittest/assets/wav2vec2/huggingface/wavlm-base.json
+++ b/test/torchaudio_unittest/assets/wav2vec2/huggingface/wavlm-base.json
+{
+    "activation_dropout": 0.0,
+    "adapter_kernel_size": 3,
+    "adapter_stride": 2,
+    "add_adapter": false,
+    "apply_spec_augment": true,
+    "architectures": [
+      "WavLMModel"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "classifier_proj_size": 256,
+    "codevector_dim": 256,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": false,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": false,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_norm": "group",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "freeze_feat_extract_train": true,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.05,
+    "mask_channel_length": 10,
+    "mask_channel_min_space": 1,
+    "mask_channel_other": 0.0,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_min_space": 1,
+    "mask_time_other": 0.0,
+    "mask_time_prob": 0.05,
+    "mask_time_selection": "static",
+    "max_bucket_distance": 800,
+    "model_type": "wavlm",
+    "no_mask_channel_overlap": false,
+    "no_mask_time_overlap": false,
+    "num_adapter_layers": 3,
+    "num_attention_heads": 12,
+    "num_buckets": 320,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_ctc_classes": 80,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 12,
+    "num_negatives": 100,
+    "output_hidden_size": 768,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 256,
+    "tokenizer_class": "Wav2Vec2CTCTokenizer",
+    "torch_dtype": "float32",
+    "transformers_version": "4.15.0.dev0",
+    "use_weighted_layer_sum": false,
+    "vocab_size": 32
+  }